Diffstat (limited to 'vendor/github.com/klauspost')
36 files changed, 0 insertions, 8914 deletions
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE deleted file mode 100644 index 74487567..00000000 --- a/vendor/github.com/klauspost/compress/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2012 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/compress/flate/copy.go b/vendor/github.com/klauspost/compress/flate/copy.go deleted file mode 100644 index a3200a8f..00000000 --- a/vendor/github.com/klauspost/compress/flate/copy.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// forwardCopy is like the built-in copy function except that it always goes -// forward from the start, even if the dst and src overlap. -// It is equivalent to: -// for i := 0; i < n; i++ { -// mem[dst+i] = mem[src+i] -// } -func forwardCopy(mem []byte, dst, src, n int) { - if dst <= src { - copy(mem[dst:dst+n], mem[src:src+n]) - return - } - for { - if dst >= src+n { - copy(mem[dst:dst+n], mem[src:src+n]) - return - } - // There is some forward overlap. The destination - // will be filled with a repeated pattern of mem[src:src+k]. - // We copy one instance of the pattern here, then repeat. - // Each time around this loop k will double. - k := dst - src - copy(mem[dst:dst+k], mem[src:src+k]) - n -= k - dst += k - } -} diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go deleted file mode 100644 index 70a6095e..00000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go +++ /dev/null @@ -1,41 +0,0 @@ -//+build !noasm -//+build !appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -import ( - "github.com/klauspost/cpuid" -) - -// crc32sse returns a hash for the first 4 bytes of the slice -// len(a) must be >= 4. -//go:noescape -func crc32sse(a []byte) uint32 - -// crc32sseAll calculates hashes for each 4-byte set in a. 
-// dst must be east len(a) - 4 in size. -// The size is not checked by the assembly. -//go:noescape -func crc32sseAll(a []byte, dst []uint32) - -// matchLenSSE4 returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -// -// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions. -// -//go:noescape -func matchLenSSE4(a, b []byte, max int) int - -// histogram accumulates a histogram of b in h. -// h must be at least 256 entries in length, -// and must be cleared before calling this function. -//go:noescape -func histogram(b []byte, h []int32) - -// Detect SSE 4.2 feature. -func init() { - useSSE42 = cpuid.CPU.SSE42() -} diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s deleted file mode 100644 index 2fb2079b..00000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s +++ /dev/null @@ -1,213 +0,0 @@ -//+build !noasm -//+build !appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -// func crc32sse(a []byte) uint32 -TEXT ·crc32sse(SB), 4, $0 - MOVQ a+0(FP), R10 - XORQ BX, BX - - // CRC32 dword (R10), EBX - BYTE $0xF2; BYTE $0x41; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0x1a - - MOVL BX, ret+24(FP) - RET - -// func crc32sseAll(a []byte, dst []uint32) -TEXT ·crc32sseAll(SB), 4, $0 - MOVQ a+0(FP), R8 // R8: src - MOVQ a_len+8(FP), R10 // input length - MOVQ dst+24(FP), R9 // R9: dst - SUBQ $4, R10 - JS end - JZ one_crc - MOVQ R10, R13 - SHRQ $2, R10 // len/4 - ANDQ $3, R13 // len&3 - XORQ BX, BX - ADDQ $1, R13 - TESTQ R10, R10 - JZ rem_loop - -crc_loop: - MOVQ (R8), R11 - XORQ BX, BX - XORQ DX, DX - XORQ DI, DI - MOVQ R11, R12 - SHRQ $8, R11 - MOVQ R12, AX - MOVQ R11, CX - SHRQ $16, R12 - SHRQ $16, R11 - MOVQ R12, SI - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - // CRC32 ECX, EDX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd1 - - // CRC32 ESI, EDI - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xfe - MOVL BX, (R9) - MOVL DX, 4(R9) - MOVL DI, 8(R9) - - XORQ BX, BX - MOVL R11, AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - MOVL BX, 12(R9) - - ADDQ $16, R9 - ADDQ $4, R8 - XORQ BX, BX - SUBQ $1, R10 - JNZ crc_loop - -rem_loop: - MOVL (R8), AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - MOVL BX, (R9) - ADDQ $4, R9 - ADDQ $1, R8 - XORQ BX, BX - SUBQ $1, R13 - JNZ rem_loop - -end: - RET - -one_crc: - MOVQ $1, R13 - XORQ BX, BX - JMP rem_loop - -// func matchLenSSE4(a, b []byte, max int) int -TEXT ·matchLenSSE4(SB), 4, $0 - MOVQ a_base+0(FP), SI - MOVQ b_base+24(FP), DI - MOVQ DI, DX - MOVQ max+48(FP), CX - -cmp8: - // As long as we are 8 or more bytes before the end of max, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ CX, $8 - JLT cmp1 - MOVQ (SI), AX - MOVQ (DI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, SI - ADDQ $8, DI - SUBQ $8, CX - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, DI - - // Subtract off &b[0] to convert from &b[ret] to ret, and return. 
- SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -cmp1: - // In the slices' tail, compare 1 byte at a time. - CMPQ CX, $0 - JEQ matchLenEnd - MOVB (SI), AX - MOVB (DI), BX - CMPB AX, BX - JNE matchLenEnd - ADDQ $1, SI - ADDQ $1, DI - SUBQ $1, CX - JMP cmp1 - -matchLenEnd: - // Subtract off &b[0] to convert from &b[ret] to ret, and return. - SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -// func histogram(b []byte, h []int32) -TEXT ·histogram(SB), 4, $0 - MOVQ b+0(FP), SI // SI: &b - MOVQ b_len+8(FP), R9 // R9: len(b) - MOVQ h+24(FP), DI // DI: Histogram - MOVQ R9, R8 - SHRQ $3, R8 - JZ hist1 - XORQ R11, R11 - -loop_hist8: - MOVQ (SI), R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - INCL (DI)(R10*4) - - ADDQ $8, SI - DECQ R8 - JNZ loop_hist8 - -hist1: - ANDQ $7, R9 - JZ end_hist - XORQ R10, R10 - -loop_hist1: - MOVB (SI), R10 - INCL (DI)(R10*4) - INCQ SI - DECQ R9 - JNZ loop_hist1 - -end_hist: - RET diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go deleted file mode 100644 index bd98bd59..00000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go +++ /dev/null @@ -1,35 +0,0 @@ -//+build !amd64 noasm appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -func init() { - useSSE42 = false -} - -// crc32sse should never be called. -func crc32sse(a []byte) uint32 { - panic("no assembler") -} - -// crc32sseAll should never be called. -func crc32sseAll(a []byte, dst []uint32) { - panic("no assembler") -} - -// matchLenSSE4 should never be called. -func matchLenSSE4(a, b []byte, max int) int { - panic("no assembler") - return 0 -} - -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ - } -} diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go deleted file mode 100644 index 76e9682f..00000000 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ /dev/null @@ -1,1351 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright (c) 2015 Klaus Post -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -import ( - "fmt" - "io" - "math" -) - -const ( - NoCompression = 0 - BestSpeed = 1 - BestCompression = 9 - DefaultCompression = -1 - - // HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman - // entropy encoding. This mode is useful in compressing data that has - // already been compressed with an LZ style algorithm (e.g. Snappy or LZ4) - // that lacks an entropy encoder. Compression gains are achieved when - // certain bytes in the input stream occur more frequently than others. - // - // Note that HuffmanOnly produces a compressed output that is - // RFC 1951 compliant. That is, any valid DEFLATE decompressor will - // continue to be able to decompress this output. - HuffmanOnly = -2 - ConstantCompression = HuffmanOnly // compatibility alias. 
- - logWindowSize = 15 - windowSize = 1 << logWindowSize - windowMask = windowSize - 1 - logMaxOffsetSize = 15 // Standard DEFLATE - minMatchLength = 4 // The smallest match that the compressor looks for - maxMatchLength = 258 // The longest match for the compressor - minOffsetSize = 1 // The shortest offset that makes any sense - - // The maximum number of tokens we put into a single flat block, just too - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 - maxStoreBlockSize = 65535 - hashBits = 17 // After 17 performance degrades - hashSize = 1 << hashBits - hashMask = (1 << hashBits) - 1 - hashShift = (hashBits + minMatchLength - 1) / minMatchLength - maxHashOffset = 1 << 24 - - skipNever = math.MaxInt32 -) - -var useSSE42 bool - -type compressionLevel struct { - good, lazy, nice, chain, fastSkipHashing, level int -} - -// Compression levels have been rebalanced from zlib deflate defaults -// to give a bigger spread in speed and compression. -// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/ -var levels = []compressionLevel{ - {}, // 0 - // Level 1-4 uses specialized algorithm - values not used - {0, 0, 0, 0, 0, 1}, - {0, 0, 0, 0, 0, 2}, - {0, 0, 0, 0, 0, 3}, - {0, 0, 0, 0, 0, 4}, - // For levels 5-6 we don't bother trying with lazy matches. - // Lazy matching is at least 30% slower, with 1.5% increase. - {6, 0, 12, 8, 12, 5}, - {8, 0, 24, 16, 16, 6}, - // Levels 7-9 use increasingly more lazy matching - // and increasingly stringent conditions for "good enough". - {8, 8, 24, 16, skipNever, 7}, - {10, 16, 24, 64, skipNever, 8}, - {32, 258, 258, 4096, skipNever, 9}, -} - -type compressor struct { - compressionLevel - - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) - - // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - sync bool // requesting flush - - // Input hash chains - // hashHead[hashValue] contains the largest inputIndex with the specified hash value - // If hashHead[hashValue] is within the current window, then - // hashPrev[hashHead[hashValue] & windowMask] contains the previous index - // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int - - // input window: unprocessed data is window[index:windowEnd] - index int - window []byte - windowEnd int - blockStart int // window index where current tokens start - byteAvailable bool // if true, still need to process window[index-1]. - - // queued output tokens - tokens tokens - - // deflate state - length int - offset int - hash uint32 - maxInsertIndex int - err error - ii uint16 // position of last match, intended to overflow to reset. 
- - snap snappyEnc - hashMatch [maxMatchLength + minMatchLength]uint32 -} - -func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { - // shift the window by windowSize - copy(d.window[:], d.window[windowSize:2*windowSize]) - d.index -= windowSize - d.windowEnd -= windowSize - if d.blockStart >= windowSize { - d.blockStart -= windowSize - } else { - d.blockStart = math.MaxInt32 - } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - for i, v := range d.hashPrev { - if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) - } else { - d.hashPrev[i] = 0 - } - } - for i, v := range d.hashHead { - if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) - } else { - d.hashHead[i] = 0 - } - } - } - } - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - -func (d *compressor) writeBlock(tok tokens, index int, eof bool) error { - if index > 0 || eof { - var window []byte - if d.blockStart <= index { - window = d.window[d.blockStart:index] - } - d.blockStart = index - d.w.writeBlock(tok.tokens[:tok.n], eof, window) - return d.w.err - } - return nil -} - -// writeBlockSkip writes the current block and uses the number of tokens -// to determine if the block should be stored on no matches, or -// only huffman encoded. -func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error { - if index > 0 || eof { - if d.blockStart <= index { - window := d.window[d.blockStart:index] - // If we removed less than a 64th of all literals - // we huffman compress the block. - if int(tok.n) > len(window)-int(tok.n>>6) { - d.w.writeBlockHuff(eof, window) - } else { - // Write a dynamic huffman block. - d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window) - } - } else { - d.w.writeBlock(tok.tokens[:tok.n], eof, nil) - } - d.blockStart = index - return d.w.err - } - return nil -} - -// fillWindow will fill the current window with the supplied -// dictionary and calculate all hashes. -// This is much faster than doing a full encode. -// Should only be used after a start/reset. -func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode, - // use constant or Snappy compression. - switch d.compressionLevel.level { - case 0, 1, 2: - return - } - // If we are given too much, cut it. - if len(b) > windowSize { - b = b[len(b)-windowSize:] - } - // Add all to window. - n := copy(d.window[d.windowEnd:], b) - - // Calculate 256 hashes at the time (more L1 cache hits) - loops := (n + 256 - minMatchLength) / 256 - for j := 0; j < loops; j++ { - startindex := j * 256 - end := startindex + 256 + minMatchLength - 1 - if end > n { - end = n - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - - if dstSize <= 0 { - continue - } - - dst := d.hashMatch[:dstSize] - d.bulkHasher(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - // Update window information. - d.windowEnd += n - d.index = n -} - -// Try to find a match starting at index whose length is greater than prevSize. -// We only look at chainCount possibilities before giving up. 
-// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } - - win := d.window[0 : pos+minMatchLook] - - // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } - - // If we've got a match that's good enough, only look in 1/4 the chain. - tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } - - wEnd := win[pos+length] - wPos := win[pos:] - minIndex := pos - windowSize - - for i := prevHead; tries > 0; tries-- { - if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break - } - wEnd = win[pos+n] - } - } - if i == minIndex { - // hashPrev[i & windowMask] has already been overwritten, so stop now. - break - } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { - break - } - } - return -} - -// Try to find a match starting at index whose length is greater than prevSize. -// We only look at chainCount possibilities before giving up. -// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } - - win := d.window[0 : pos+minMatchLook] - - // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } - - // If we've got a match that's good enough, only look in 1/4 the chain. - tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } - - wEnd := win[pos+length] - wPos := win[pos:] - minIndex := pos - windowSize - - for i := prevHead; tries > 0; tries-- { - if wEnd == win[i+length] { - n := matchLenSSE4(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break - } - wEnd = win[pos+n] - } - } - if i == minIndex { - // hashPrev[i & windowMask] has already been overwritten, so stop now. - break - } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { - break - } - } - return -} - -func (d *compressor) writeStoredBlock(buf []byte) error { - if d.w.writeStoredHeader(len(buf), false); d.w.err != nil { - return d.w.err - } - d.w.writeBytes(buf) - return d.w.err -} - -const hashmul = 0x1e35a7bd - -// hash4 returns a hash representation of the first 4 bytes -// of the supplied slice. -// The caller must ensure that len(b) >= 4. 
-func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) -} - -// bulkHash4 will compute hashes using the same -// algorithm as hash4 -func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { - return - } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 - for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - -func (d *compressor) initDeflate() { - d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.length = minMatchLength - 1 - d.offset = 0 - d.byteAvailable = false - d.index = 0 - d.hash = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 - if useSSE42 { - d.bulkHasher = crc32sseAll - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazy -func (d *compressor) deflate() { - - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - ch := d.hashHead[d.hash&hashMask] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset) - } - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if d.length >= minMatchLength { - d.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. 
- if d.length <= d.fastSkipHashing { - var newIndex int - newIndex = d.index + d.length - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - bulkHash4(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - d.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - d.index += d.length - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - d.ii++ - end := d.index + int(d.ii>>uint(d.fastSkipHashing)) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := d.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - d.index = end - } - } -} - -// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, -// meaning it always has lazy matching on. -func (d *compressor) deflateLazy() { - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - // Flush current output block if any. 
- if d.byteAvailable { - // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - } - if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - ch := d.hashHead[d.hash&hashMask] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset) - } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if prevLength >= minMatchLength && d.length <= prevLength { - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ - - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - var newIndex int - newIndex = d.index + prevLength - 1 - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - bulkHash4(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - - d.index = newIndex - d.byteAvailable = false - d.length = minMatchLength - 1 - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - // Reset, if we got a match this run. - if d.length >= minMatchLength { - d.ii = 0 - } - // We have a byte waiting. Emit it. - if d.byteAvailable { - d.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - - // If we have a long run of no matches, skip additional bytes - // Resets when d.ii overflows after 64KB. 
- if d.ii > 31 { - n := int(d.ii >> 5) - for j := 0; j < n; j++ { - if d.index >= d.windowEnd-1 { - break - } - - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - } - // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - } else { - d.index++ - d.byteAvailable = true - } - } - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazySSE -func (d *compressor) deflateSSE() { - - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - ch := d.hashHead[d.hash] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) - } - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if d.length >= minMatchLength { - d.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. 
- if d.length <= d.fastSkipHashing { - var newIndex int - newIndex = d.index + d.length - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - d.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - d.index += d.length - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - d.ii++ - end := d.index + int(d.ii>>5) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := d.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - d.index = end - } - } -} - -// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, -// meaning it always has lazy matching on. -func (d *compressor) deflateLazySSE() { - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - // Flush current output block if any. 
- if d.byteAvailable { - // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - } - if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - ch := d.hashHead[d.hash] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) - } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if prevLength >= minMatchLength && d.length <= prevLength { - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ - - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - var newIndex int - newIndex = d.index + prevLength - 1 - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - - d.index = newIndex - d.byteAvailable = false - d.length = minMatchLength - 1 - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - // Reset, if we got a match this run. - if d.length >= minMatchLength { - d.ii = 0 - } - // We have a byte waiting. Emit it. - if d.byteAvailable { - d.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - - // If we have a long run of no matches, skip additional bytes - // Resets when d.ii overflows after 64KB. 
- if d.ii > 31 { - n := int(d.ii >> 6) - for j := 0; j < n; j++ { - if d.index >= d.windowEnd-1 { - break - } - - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - } - // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - } else { - d.index++ - d.byteAvailable = true - } - } - } -} - -func (d *compressor) store() { - if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - d.windowEnd = 0 - } -} - -// fillWindow will fill the buffer with data for huffman-only compression. -// The number of bytes copied is returned. -func (d *compressor) fillBlock(b []byte) int { - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - -// storeHuff will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) storeHuff() { - if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { - return - } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - d.windowEnd = 0 -} - -// storeHuff will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) storeSnappy() { - // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { - if !d.sync { - return - } - // Handle extremely small sizes. - if d.windowEnd < 128 { - if d.windowEnd == 0 { - return - } - if d.windowEnd <= 32 { - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - d.tokens.n = 0 - d.windowEnd = 0 - } else { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.tokens.n = 0 - d.windowEnd = 0 - d.snap.Reset() - return - } - } - - d.snap.Encode(&d.tokens, d.window[:d.windowEnd]) - // If we made zero matches, store the block as is. - if int(d.tokens.n) == d.windowEnd { - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - // If we removed less than 1/16th, huffman compress the block. - } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } else { - d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.tokens.n = 0 - d.windowEnd = 0 -} - -// write will add input byte to the stream. -// Unless an error occurs all bytes will be consumed. 
-func (d *compressor) write(b []byte) (n int, err error) { - if d.err != nil { - return 0, d.err - } - n = len(b) - for len(b) > 0 { - d.step(d) - b = b[d.fill(d, b):] - if d.err != nil { - return 0, d.err - } - } - return n, d.err -} - -func (d *compressor) syncFlush() error { - d.sync = true - if d.err != nil { - return d.err - } - d.step(d) - if d.err == nil { - d.w.writeStoredHeader(0, false) - d.w.flush() - d.err = d.w.err - } - d.sync = false - return d.err -} - -func (d *compressor) init(w io.Writer, level int) (err error) { - d.w = newHuffmanBitWriter(w) - - switch { - case level == NoCompression: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillBlock - d.step = (*compressor).store - case level == ConstantCompression: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillBlock - d.step = (*compressor).storeHuff - case level >= 1 && level <= 4: - d.snap = newSnappy(level) - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillBlock - d.step = (*compressor).storeSnappy - case level == DefaultCompression: - level = 5 - fallthrough - case 5 <= level && level <= 9: - d.compressionLevel = levels[level] - d.initDeflate() - d.fill = (*compressor).fillDeflate - if d.fastSkipHashing == skipNever { - if useSSE42 { - d.step = (*compressor).deflateLazySSE - } else { - d.step = (*compressor).deflateLazy - } - } else { - if useSSE42 { - d.step = (*compressor).deflateSSE - } else { - d.step = (*compressor).deflate - - } - } - default: - return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) - } - return nil -} - -// reset the state of the compressor. -func (d *compressor) reset(w io.Writer) { - d.w.reset(w) - d.sync = false - d.err = nil - // We only need to reset a few things for Snappy. - if d.snap != nil { - d.snap.Reset() - d.windowEnd = 0 - d.tokens.n = 0 - return - } - switch d.compressionLevel.chain { - case 0: - // level was NoCompression or ConstantCompresssion. - d.windowEnd = 0 - default: - d.chainHead = -1 - for i := range d.hashHead { - d.hashHead[i] = 0 - } - for i := range d.hashPrev { - d.hashPrev[i] = 0 - } - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 - d.blockStart, d.byteAvailable = 0, false - d.tokens.n = 0 - d.length = minMatchLength - 1 - d.offset = 0 - d.hash = 0 - d.ii = 0 - d.maxInsertIndex = 0 - } -} - -func (d *compressor) close() error { - if d.err != nil { - return d.err - } - d.sync = true - d.step(d) - if d.err != nil { - return d.err - } - if d.w.writeStoredHeader(0, true); d.w.err != nil { - return d.w.err - } - d.w.flush() - return d.w.err -} - -// NewWriter returns a new Writer compressing data at the given level. -// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression); -// higher levels typically run slower but compress more. -// Level 0 (NoCompression) does not attempt any compression; it only adds the -// necessary DEFLATE framing. -// Level -1 (DefaultCompression) uses the default compression level. -// Level -2 (ConstantCompression) will use Huffman compression only, giving -// a very fast compression for all types of input, but sacrificing considerable -// compression efficiency. -// -// If level is in the range [-2, 9] then the error returned will be nil. -// Otherwise the error returned will be non-nil. 
-func NewWriter(w io.Writer, level int) (*Writer, error) { - var dw Writer - if err := dw.d.init(w, level); err != nil { - return nil, err - } - return &dw, nil -} - -// NewWriterDict is like NewWriter but initializes the new -// Writer with a preset dictionary. The returned Writer behaves -// as if the dictionary had been written to it without producing -// any compressed output. The compressed data written to w -// can only be decompressed by a Reader initialized with the -// same dictionary. -func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) - if err != nil { - return nil, err - } - zw.d.fillWindow(dict) - zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. - return zw, err -} - -type dictWriter struct { - w io.Writer -} - -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - -// A Writer takes data written to it and writes the compressed -// form of that data to an underlying writer (see NewWriter). -type Writer struct { - d compressor - dict []byte -} - -// Write writes data to w, which will eventually write the -// compressed form of data to its underlying writer. -func (w *Writer) Write(data []byte) (n int, err error) { - return w.d.write(data) -} - -// Flush flushes any pending data to the underlying writer. -// It is useful mainly in compressed network protocols, to ensure that -// a remote reader has enough data to reconstruct a packet. -// Flush does not return until the data has been written. -// Calling Flush when there is no pending data still causes the Writer -// to emit a sync marker of at least 4 bytes. -// If the underlying writer returns an error, Flush returns that error. -// -// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH. -func (w *Writer) Flush() error { - // For more about flushing: - // http://www.bolet.org/~pornin/deflate-flush.html - return w.d.syncFlush() -} - -// Close flushes and closes the writer. -func (w *Writer) Close() error { - return w.d.close() -} - -// Reset discards the writer's state and makes it equivalent to -// the result of NewWriter or NewWriterDict called with dst -// and w's level and dictionary. -func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { - // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) - } else { - // w was created with NewWriter - w.d.reset(dst) - } -} - -// ResetDict discards the writer's state and makes it equivalent to -// the result of NewWriter or NewWriterDict called with dst -// and w's level, but sets a specific dictionary. -func (w *Writer) ResetDict(dst io.Writer, dict []byte) { - w.dict = dict - w.d.reset(dst) - w.d.fillWindow(w.dict) -} diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go deleted file mode 100644 index 71c75a06..00000000 --- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// dictDecoder implements the LZ77 sliding dictionary as used in decompression. -// LZ77 decompresses data through sequences of two forms of commands: -// -// * Literal insertions: Runs of one or more symbols are inserted into the data -// stream as is. 
This is accomplished through the writeByte method for a -// single symbol, or combinations of writeSlice/writeMark for multiple symbols. -// Any valid stream must start with a literal insertion if no preset dictionary -// is used. -// -// * Backward copies: Runs of one or more symbols are copied from previously -// emitted data. Backward copies come as the tuple (dist, length) where dist -// determines how far back in the stream to copy from and length determines how -// many bytes to copy. Note that it is valid for the length to be greater than -// the distance. Since LZ77 uses forward copies, that situation is used to -// perform a form of run-length encoding on repeated runs of symbols. -// The writeCopy and tryWriteCopy are used to implement this command. -// -// For performance reasons, this implementation performs little to no sanity -// checks about the arguments. As such, the invariants documented for each -// method call must be respected. -type dictDecoder struct { - hist []byte // Sliding window history - - // Invariant: 0 <= rdPos <= wrPos <= len(hist) - wrPos int // Current output position in buffer - rdPos int // Have emitted hist[:rdPos] already - full bool // Has a full window length been written yet? -} - -// init initializes dictDecoder to have a sliding window dictionary of the given -// size. If a preset dict is provided, it will initialize the dictionary with -// the contents of dict. -func (dd *dictDecoder) init(size int, dict []byte) { - *dd = dictDecoder{hist: dd.hist} - - if cap(dd.hist) < size { - dd.hist = make([]byte, size) - } - dd.hist = dd.hist[:size] - - if len(dict) > len(dd.hist) { - dict = dict[len(dict)-len(dd.hist):] - } - dd.wrPos = copy(dd.hist, dict) - if dd.wrPos == len(dd.hist) { - dd.wrPos = 0 - dd.full = true - } - dd.rdPos = dd.wrPos -} - -// histSize reports the total amount of historical data in the dictionary. -func (dd *dictDecoder) histSize() int { - if dd.full { - return len(dd.hist) - } - return dd.wrPos -} - -// availRead reports the number of bytes that can be flushed by readFlush. -func (dd *dictDecoder) availRead() int { - return dd.wrPos - dd.rdPos -} - -// availWrite reports the available amount of output buffer space. -func (dd *dictDecoder) availWrite() int { - return len(dd.hist) - dd.wrPos -} - -// writeSlice returns a slice of the available buffer to write data to. -// -// This invariant will be kept: len(s) <= availWrite() -func (dd *dictDecoder) writeSlice() []byte { - return dd.hist[dd.wrPos:] -} - -// writeMark advances the writer pointer by cnt. -// -// This invariant must be kept: 0 <= cnt <= availWrite() -func (dd *dictDecoder) writeMark(cnt int) { - dd.wrPos += cnt -} - -// writeByte writes a single byte to the dictionary. -// -// This invariant must be kept: 0 < availWrite() -func (dd *dictDecoder) writeByte(c byte) { - dd.hist[dd.wrPos] = c - dd.wrPos++ -} - -// writeCopy copies a string at a given (dist, length) to the output. -// This returns the number of bytes copied and may be less than the requested -// length if the available space in the output buffer is too small. -// -// This invariant must be kept: 0 < dist <= histSize() -func (dd *dictDecoder) writeCopy(dist, length int) int { - dstBase := dd.wrPos - dstPos := dstBase - srcPos := dstPos - dist - endPos := dstPos + length - if endPos > len(dd.hist) { - endPos = len(dd.hist) - } - - // Copy non-overlapping section after destination position. 
- // - // This section is non-overlapping in that the copy length for this section - // is always less than or equal to the backwards distance. This can occur - // if a distance refers to data that wraps-around in the buffer. - // Thus, a backwards copy is performed here; that is, the exact bytes in - // the source prior to the copy is placed in the destination. - if srcPos < 0 { - srcPos += len(dd.hist) - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:]) - srcPos = 0 - } - - // Copy possibly overlapping section before destination position. - // - // This section can overlap if the copy length for this section is larger - // than the backwards distance. This is allowed by LZ77 so that repeated - // strings can be succinctly represented using (dist, length) pairs. - // Thus, a forwards copy is performed here; that is, the bytes copied is - // possibly dependent on the resulting bytes in the destination as the copy - // progresses along. This is functionally equivalent to the following: - // - // for i := 0; i < endPos-dstPos; i++ { - // dd.hist[dstPos+i] = dd.hist[srcPos+i] - // } - // dstPos = endPos - // - for dstPos < endPos { - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) - } - - dd.wrPos = dstPos - return dstPos - dstBase -} - -// tryWriteCopy tries to copy a string at a given (distance, length) to the -// output. This specialized version is optimized for short distances. -// -// This method is designed to be inlined for performance reasons. -// -// This invariant must be kept: 0 < dist <= histSize() -func (dd *dictDecoder) tryWriteCopy(dist, length int) int { - dstPos := dd.wrPos - endPos := dstPos + length - if dstPos < dist || endPos > len(dd.hist) { - return 0 - } - dstBase := dstPos - srcPos := dstPos - dist - - // Copy possibly overlapping section before destination position. -loop: - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) - if dstPos < endPos { - goto loop // Avoid for-loop so that this function can be inlined - } - - dd.wrPos = dstPos - return dstPos - dstBase -} - -// readFlush returns a slice of the historical buffer that is ready to be -// emitted to the user. The data returned by readFlush must be fully consumed -// before calling any other dictDecoder methods. -func (dd *dictDecoder) readFlush() []byte { - toRead := dd.hist[dd.rdPos:dd.wrPos] - dd.rdPos = dd.wrPos - if dd.wrPos == len(dd.hist) { - dd.wrPos, dd.rdPos = 0, 0 - dd.full = true - } - return toRead -} diff --git a/vendor/github.com/klauspost/compress/flate/gen.go b/vendor/github.com/klauspost/compress/flate/gen.go deleted file mode 100644 index 154c89a4..00000000 --- a/vendor/github.com/klauspost/compress/flate/gen.go +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build ignore - -// This program generates fixedhuff.go -// Invoke as -// -// go run gen.go -output fixedhuff.go - -package main - -import ( - "bytes" - "flag" - "fmt" - "go/format" - "io/ioutil" - "log" -) - -var filename = flag.String("output", "fixedhuff.go", "output file name") - -const maxCodeLen = 16 - -// Note: the definition of the huffmanDecoder struct is copied from -// inflate.go, as it is private to the implementation. 
- -// chunk & 15 is number of bits -// chunk >> 4 is value, including table link - -const ( - huffmanChunkBits = 9 - huffmanNumChunks = 1 << huffmanChunkBits - huffmanCountMask = 15 - huffmanValueShift = 4 -) - -type huffmanDecoder struct { - min int // the minimum code length - chunks [huffmanNumChunks]uint32 // chunks as described above - links [][]uint32 // overflow links - linkMask uint32 // mask the width of the link table -} - -// Initialize Huffman decoding tables from array of code lengths. -// Following this function, h is guaranteed to be initialized into a complete -// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a -// degenerate case where the tree has only a single symbol with length 1. Empty -// trees are permitted. -func (h *huffmanDecoder) init(bits []int) bool { - // Sanity enables additional runtime tests during Huffman - // table construction. It's intended to be used during - // development to supplement the currently ad-hoc unit tests. - const sanity = false - - if h.min != 0 { - *h = huffmanDecoder{} - } - - // Count number of codes of each length, - // compute min and max length. - var count [maxCodeLen]int - var min, max int - for _, n := range bits { - if n == 0 { - continue - } - if min == 0 || n < min { - min = n - } - if n > max { - max = n - } - count[n]++ - } - - // Empty tree. The decompressor.huffSym function will fail later if the tree - // is used. Technically, an empty tree is only valid for the HDIST tree and - // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree - // is guaranteed to fail since it will attempt to use the tree to decode the - // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is - // guaranteed to fail later since the compressed data section must be - // composed of at least one symbol (the end-of-block marker). - if max == 0 { - return true - } - - code := 0 - var nextcode [maxCodeLen]int - for i := min; i <= max; i++ { - code <<= 1 - nextcode[i] = code - code += count[i] - } - - // Check that the coding is complete (i.e., that we've - // assigned all 2-to-the-max possible bit sequences). - // Exception: To be compatible with zlib, we also need to - // accept degenerate single-code codings. See also - // TestDegenerateHuffmanCoding. - if code != 1<<uint(max) && !(code == 1 && max == 1) { - return false - } - - h.min = min - if max > huffmanChunkBits { - numLinks := 1 << (uint(max) - huffmanChunkBits) - h.linkMask = uint32(numLinks - 1) - - // create link tables - link := nextcode[huffmanChunkBits+1] >> 1 - h.links = make([][]uint32, huffmanNumChunks-link) - for j := uint(link); j < huffmanNumChunks; j++ { - reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8 - reverse >>= uint(16 - huffmanChunkBits) - off := j - uint(link) - if sanity && h.chunks[reverse] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) - h.links[off] = make([]uint32, numLinks) - } - } - - for i, n := range bits { - if n == 0 { - continue - } - code := nextcode[n] - nextcode[n]++ - chunk := uint32(i<<huffmanValueShift | n) - reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8 - reverse >>= uint(16 - n) - if n <= huffmanChunkBits { - for off := reverse; off < len(h.chunks); off += 1 << uint(n) { - // We should never need to overwrite - // an existing chunk. Also, 0 is - // never a valid chunk, because the - // lower 4 "count" bits should be - // between 1 and 15. 
- if sanity && h.chunks[off] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[off] = chunk - } - } else { - j := reverse & (huffmanNumChunks - 1) - if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { - // Longer codes should have been - // associated with a link table above. - panic("impossible: not an indirect chunk") - } - value := h.chunks[j] >> huffmanValueShift - linktab := h.links[value] - reverse >>= huffmanChunkBits - for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { - if sanity && linktab[off] != 0 { - panic("impossible: overwriting existing chunk") - } - linktab[off] = chunk - } - } - } - - if sanity { - // Above we've sanity checked that we never overwrote - // an existing entry. Here we additionally check that - // we filled the tables completely. - for i, chunk := range h.chunks { - if chunk == 0 { - // As an exception, in the degenerate - // single-code case, we allow odd - // chunks to be missing. - if code == 1 && i%2 == 1 { - continue - } - panic("impossible: missing chunk") - } - } - for _, linktab := range h.links { - for _, chunk := range linktab { - if chunk == 0 { - panic("impossible: missing chunk") - } - } - } - } - - return true -} - -func main() { - flag.Parse() - - var h huffmanDecoder - var bits [288]int - initReverseByte() - for i := 0; i < 144; i++ { - bits[i] = 8 - } - for i := 144; i < 256; i++ { - bits[i] = 9 - } - for i := 256; i < 280; i++ { - bits[i] = 7 - } - for i := 280; i < 288; i++ { - bits[i] = 8 - } - h.init(bits[:]) - if h.links != nil { - log.Fatal("Unexpected links table in fixed Huffman decoder") - } - - var buf bytes.Buffer - - fmt.Fprintf(&buf, `// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file.`+"\n\n") - - fmt.Fprintln(&buf, "package flate") - fmt.Fprintln(&buf) - fmt.Fprintln(&buf, "// autogenerated by go run gen.go -output fixedhuff.go, DO NOT EDIT") - fmt.Fprintln(&buf) - fmt.Fprintln(&buf, "var fixedHuffmanDecoder = huffmanDecoder{") - fmt.Fprintf(&buf, "\t%d,\n", h.min) - fmt.Fprintln(&buf, "\t[huffmanNumChunks]uint32{") - for i := 0; i < huffmanNumChunks; i++ { - if i&7 == 0 { - fmt.Fprintf(&buf, "\t\t") - } else { - fmt.Fprintf(&buf, " ") - } - fmt.Fprintf(&buf, "0x%04x,", h.chunks[i]) - if i&7 == 7 { - fmt.Fprintln(&buf) - } - } - fmt.Fprintln(&buf, "\t},") - fmt.Fprintln(&buf, "\tnil, 0,") - fmt.Fprintln(&buf, "}") - - data, err := format.Source(buf.Bytes()) - if err != nil { - log.Fatal(err) - } - err = ioutil.WriteFile(*filename, data, 0644) - if err != nil { - log.Fatal(err) - } -} - -var reverseByte [256]byte - -func initReverseByte() { - for x := 0; x < 256; x++ { - var result byte - for i := uint(0); i < 8; i++ { - result |= byte(((x >> i) & 1) << (7 - i)) - } - reverseByte[x] = result - } -} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go deleted file mode 100644 index f9b2a699..00000000 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ /dev/null @@ -1,701 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -import ( - "io" -) - -const ( - // The largest offset code. - offsetCodeCount = 30 - - // The special code used to mark the end of a block. - endBlockMarker = 256 - - // The first length code. 
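gen.go above follows a common Go code-generation pattern: render source into a buffer, push it through go/format.Source so a generator bug that produces unparsable code fails at generation time, then write the file. A minimal sketch of that pattern (the file name is hypothetical, and os.WriteFile stands in for the ioutil.WriteFile used above):

package main

import (
	"bytes"
	"fmt"
	"go/format"
	"log"
	"os"
)

func main() {
	var buf bytes.Buffer
	fmt.Fprintln(&buf, "package demo")
	fmt.Fprintln(&buf, "var answer=42") // deliberately unformatted

	// format.Source both gofmts the output and rejects code that
	// does not parse.
	src, err := format.Source(buf.Bytes())
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("demo_gen.go", src, 0644); err != nil {
		log.Fatal(err)
	}
}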
- lengthCodesStart = 257 - - // The number of codegen codes. - codegenCodeCount = 19 - badCode = 255 - - // bufferFlushSize indicates the buffer size - // after which bytes are flushed to the writer. - // Should preferably be a multiple of 6, since - // we accumulate 6 bytes between writes to the buffer. - bufferFlushSize = 240 - - // bufferSize is the actual output byte buffer size. - // It must have additional headroom for a flush - // which can contain up to 8 bytes. - bufferSize = bufferFlushSize + 8 -) - -// The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ - /* 257 */ 0, 0, 0, - /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, - /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, - /* 280 */ 4, 5, 5, 5, 5, 0, -} - -// The length indicated by length code X - LENGTH_CODES_START. -var lengthBase = []uint32{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, - 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, - 64, 80, 96, 112, 128, 160, 192, 224, 255, -} - -// offset code word extra bits. -var offsetExtraBits = []int8{ - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, - 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - /* extended window */ - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, -} - -var offsetBase = []uint32{ - /* normal deflate */ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, - - /* extended window */ - 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000, - 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000, - 0x100000, 0x180000, 0x200000, 0x300000, -} - -// The odd order in which the codegen code sizes are written. -var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} - -type huffmanBitWriter struct { - // writer is the underlying writer. - // Do not use it directly; use the write method, which ensures - // that Write errors are sticky. - writer io.Writer - - // Data waiting to be written is bytes[0:nbytes] - // and then the low nbits of bits. 
- bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 - literalEncoding *huffmanEncoder - offsetEncoding *huffmanEncoder - codegenEncoding *huffmanEncoder - err error -} - -func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { - return &huffmanBitWriter{ - writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), - codegenEncoding: newHuffmanEncoder(codegenCodeCount), - offsetEncoding: newHuffmanEncoder(offsetCodeCount), - } -} - -func (w *huffmanBitWriter) reset(writer io.Writer) { - w.writer = writer - w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil - w.bytes = [bufferSize]byte{} -} - -func (w *huffmanBitWriter) flush() { - if w.err != nil { - w.nbits = 0 - return - } - n := w.nbytes - for w.nbits != 0 { - w.bytes[n] = byte(w.bits) - w.bits >>= 8 - if w.nbits > 8 { // Avoid underflow - w.nbits -= 8 - } else { - w.nbits = 0 - } - n++ - } - w.bits = 0 - w.write(w.bytes[:n]) - w.nbytes = 0 -} - -func (w *huffmanBitWriter) write(b []byte) { - if w.err != nil { - return - } - _, w.err = w.writer.Write(b) -} - -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits - w.nbits += nb - if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n - } -} - -func (w *huffmanBitWriter) writeBytes(bytes []byte) { - if w.err != nil { - return - } - n := w.nbytes - if w.nbits&7 != 0 { - w.err = InternalError("writeBytes with unfinished bits") - return - } - for w.nbits != 0 { - w.bytes[n] = byte(w.bits) - w.bits >>= 8 - w.nbits -= 8 - n++ - } - if n != 0 { - w.write(w.bytes[:n]) - } - w.nbytes = 0 - w.write(bytes) -} - -// RFC 1951 3.2.7 specifies a special run-length encoding for specifying -// the literal and offset lengths arrays (which are concatenated into a single -// array). This method generates that run-length encoding. -// -// The result is written into the codegen array, and the frequencies -// of each code is written into the codegenFreq array. -// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional -// information. Code badCode is an end marker -// -// numLiterals The number of literals in literalEncoding -// numOffsets The number of offsets in offsetEncoding -// litenc, offenc The literal and offset encoder to use -func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { - for i := range w.codegenFreq { - w.codegenFreq[i] = 0 - } - // Note that we are using codegen both as a temporary variable for holding - // a copy of the frequencies, and as the place where we put the result. - // This is fine because the output is always shorter than the input used - // so far. - codegen := w.codegen // cache - // Copy the concatenated code sizes to codegen. Put a marker at the end. 
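The writeBits helper above appends each code to a 64-bit accumulator LSB-first and spills six whole bytes whenever at least 48 bits have piled up, so the accumulator never overflows even when a 16-bit value is added next. A compact sketch of the same packing (illustrative only):

package main

import "fmt"

func main() {
	var bits uint64
	var nbits uint

	// put queues nb bits of b above the bits already waiting, mirroring
	// the accumulator in writeBits/writeCode above.
	put := func(b uint64, nb uint) {
		bits |= b << nbits
		nbits += nb
	}
	put(0b101, 3) // first code lands in the low bits
	put(0b01, 2)  // next code stacks above it

	// The first byte written to the stream is the low byte.
	fmt.Printf("%08b\n", byte(bits)) // 00001101
}

Flushing six bytes at a time amortizes the per-byte stores, which is why bufferFlushSize is a multiple of 6.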
- cgnl := codegen[:numLiterals] - for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) - } - - cgnl = codegen[numLiterals : numLiterals+numOffsets] - for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) - } - codegen[numLiterals+numOffsets] = badCode - - size := codegen[0] - count := 1 - outIndex := 0 - for inIndex := 1; size != badCode; inIndex++ { - // INVARIANT: We have seen "count" copies of size that have not yet - // had output generated for them. - nextSize := codegen[inIndex] - if nextSize == size { - count++ - continue - } - // We need to generate codegen indicating "count" of size. - if size != 0 { - codegen[outIndex] = size - outIndex++ - w.codegenFreq[size]++ - count-- - for count >= 3 { - n := 6 - if n > count { - n = count - } - codegen[outIndex] = 16 - outIndex++ - codegen[outIndex] = uint8(n - 3) - outIndex++ - w.codegenFreq[16]++ - count -= n - } - } else { - for count >= 11 { - n := 138 - if n > count { - n = count - } - codegen[outIndex] = 18 - outIndex++ - codegen[outIndex] = uint8(n - 11) - outIndex++ - w.codegenFreq[18]++ - count -= n - } - if count >= 3 { - // count >= 3 && count <= 10 - codegen[outIndex] = 17 - outIndex++ - codegen[outIndex] = uint8(count - 3) - outIndex++ - w.codegenFreq[17]++ - count = 0 - } - } - count-- - for ; count >= 0; count-- { - codegen[outIndex] = size - outIndex++ - w.codegenFreq[size]++ - } - // Set up invariant for next time through the loop. - size = nextSize - count = 1 - } - // Marker indicating the end of the codegen. - codegen[outIndex] = badCode -} - -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { - numCodegens = len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- - } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + - w.codegenEncoding.bitLength(w.codegenFreq[:]) + - int(w.codegenFreq[16])*2 + - int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 - size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + - extraBits - - return size, numCodegens -} - -// fixedSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) fixedSize(extraBits int) int { - return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + - extraBits -} - -// storedSize calculates the stored size, including header. -// The function returns the size in bits and whether the block -// fits inside a single block. -func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { - if in == nil { - return 0, false - } - if len(in) <= maxStoreBlockSize { - return (len(in) + 5) * 8, true - } - return 0, false -} - -func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n - } -} - -// Write the header of a dynamic Huffman block to the output stream. 
-// -// numLiterals The number of literals specified in codegen -// numOffsets The number of offsets specified in codegen -// numCodegens The number of codegens used in codegen -func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { - if w.err != nil { - return - } - var firstBits int32 = 4 - if isEof { - firstBits = 5 - } - w.writeBits(firstBits, 3) - w.writeBits(int32(numLiterals-257), 5) - w.writeBits(int32(numOffsets-1), 5) - w.writeBits(int32(numCodegens-4), 4) - - for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) - w.writeBits(int32(value), 3) - } - - i := 0 - for { - var codeWord int = int(w.codegen[i]) - i++ - if codeWord == badCode { - break - } - w.writeCode(w.codegenEncoding.codes[uint32(codeWord)]) - - switch codeWord { - case 16: - w.writeBits(int32(w.codegen[i]), 2) - i++ - break - case 17: - w.writeBits(int32(w.codegen[i]), 3) - i++ - break - case 18: - w.writeBits(int32(w.codegen[i]), 7) - i++ - break - } - } -} - -func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { - if w.err != nil { - return - } - var flag int32 - if isEof { - flag = 1 - } - w.writeBits(flag, 3) - w.flush() - w.writeBits(int32(length), 16) - w.writeBits(int32(^uint16(length)), 16) -} - -func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { - if w.err != nil { - return - } - // Indicate that we are a fixed Huffman block - var value int32 = 2 - if isEof { - value = 3 - } - w.writeBits(value, 3) -} - -// writeBlock will write a block of tokens with the smallest encoding. -// The original input can be supplied, and if the huffman encoded data -// is larger than the original bytes, the data will be written as a -// stored block. -// If the input is nil, the tokens will always be Huffman encoded. -func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { - if w.err != nil { - return - } - - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) - - var extraBits int - storedSize, storable := w.storedSize(input) - if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } - } - - // Figure out smallest code. - // Fixed Huffman baseline. - var literalEncoding = fixedLiteralEncoding - var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) - - // Dynamic Huffman? - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) - - if dynamicSize < size { - size = dynamicSize - literalEncoding = w.literalEncoding - offsetEncoding = w.offsetEncoding - } - - // Stored bytes? 
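storedSize above prices a stored block at (len(in)+5)*8 bits: one byte for the 3-bit BFINAL/BTYPE field padded to a byte boundary, then two bytes each of LEN and NLEN, written by writeStoredHeader. A quick sketch of the break-even arithmetic writeBlock applies just below (the numbers are invented for illustration):

package main

import "fmt"

func main() {
	input := 1000 // example block size in bytes
	storedBits := (input + 5) * 8

	// writeBlock keeps a Huffman encoding only if it beats this figure.
	huffmanBits := 8200 // pretend estimate from fixedSize/dynamicSize
	fmt.Println(storedBits, huffmanBits < storedBits) // 8040 false
}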
- if storable && storedSize < size { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return - } - - // Huffman. - if literalEncoding == fixedLiteralEncoding { - w.writeFixedHeader(eof) - } else { - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - } - - // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) -} - -// writeBlockDynamic encodes a block using a dynamic Huffman table. -// This should be used if the symbols used have a disproportionate -// histogram distribution. -// If input is supplied and the compression savings are below 1/16th of the -// input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { - if w.err != nil { - return - } - - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) - - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return - } - - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - - // Write the tokens. - w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) -} - -// indexTokens indexes a slice of tokens, and updates -// literalFreq and offsetFreq, and generates literalEncoding -// and offsetEncoding. -// The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - for i := range w.literalFreq { - w.literalFreq[i] = 0 - } - for i := range w.offsetFreq { - w.offsetFreq[i] = 0 - } - - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ - } - - // get the number of literals - numLiterals = len(w.literalFreq) - for w.literalFreq[numLiterals-1] == 0 { - numLiterals-- - } - // get the number of offsets - numOffsets = len(w.offsetFreq) - for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 { - numOffsets-- - } - if numOffsets == 0 { - // We haven't found a single match. If we want to go with the dynamic encoding, - // we should count at least one offset to be sure that the offset huffman tree could be encoded. - w.offsetFreq[0] = 1 - numOffsets = 1 - } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) - return -} - -// writeTokens writes a slice of tokens to the output. -// codes for literal and offset encoding must be supplied. 
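writeTokens below emits a match length as a length code plus a few raw extra bits. A worked sketch of that mapping, using copies of the lengthBase/lengthExtraBits tables above and assuming, as elsewhere in this package, that lengths are stored biased by -3; the package itself uses a precomputed lengthCode table rather than this linear scan:

package main

import "fmt"

// Copies of the 29 usable entries of lengthBase/lengthExtraBits above.
var base = []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28,
	32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255}
var extra = []int8{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}

func main() {
	length := uint32(100 - 3) // match length 100, stored as length-3

	// Find the largest code whose base does not exceed the length.
	code := len(base) - 1
	for code > 0 && base[code] > length {
		code--
	}
	// Length 100 -> code 257+22 = 279 with the value 1 written in 4
	// extra bits, matching RFC 1951 section 3.2.5 (code 279 covers
	// match lengths 99..114).
	fmt.Println(257+code, length-base[code], extra[code]) // 279 1 4
}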
-func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { - if w.err != nil { - return - } - for _, t := range tokens { - if t < matchType { - w.writeCode(leCodes[t.literal()]) - continue - } - // Write the length - length := t.length() - lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) - w.writeBits(extraLength, extraLengthBits) - } - // Write the offset - offset := t.offset() - offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) - w.writeBits(extraOffset, extraOffsetBits) - } - } -} - -// huffOffset is a static offset encoder used for huffman only encoding. -// It can be reused since we will not be encoding offset values. -var huffOffset *huffmanEncoder - -func init() { - w := newHuffmanBitWriter(nil) - w.offsetFreq[0] = 1 - huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(w.offsetFreq, 15) -} - -// writeBlockHuff encodes a block of bytes as either -// Huffman encoded literals or uncompressed bytes if the -// results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { - if w.err != nil { - return - } - - // Clear histogram - for i := range w.literalFreq { - w.literalFreq[i] = 0 - } - - // Add everything as literals - histogram(input, w.literalFreq) - - w.literalFreq[endBlockMarker] = 1 - - const numLiterals = endBlockMarker + 1 - const numOffsets = 1 - - w.literalEncoding.generate(w.literalFreq, 15) - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) - - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return - } - - // Huffman. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes[:257] - n := w.nbytes - for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue - } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue - } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures - } - n = 0 - } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) -} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go deleted file mode 100644 index bdcbd823..00000000 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -import ( - "math" - "sort" -) - -// hcode is a huffman code with a bit code and bit length. -type hcode struct { - code, len uint16 -} - -type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 - lns byLiteral // stored to avoid repeated allocation in generate - lfs byFreq // stored to avoid repeated allocation in generate -} - -type literalNode struct { - literal uint16 - freq int32 -} - -// A levelInfo describes the state of the constructed tree for a given depth. -type levelInfo struct { - // Our level. for better printing - level int32 - - // The frequency of the last node at this level - lastFreq int32 - - // The frequency of the next character to add to this level - nextCharFreq int32 - - // The frequency of the next pair (from level below) to add to this level. - // Only valid if the "needed" value of the next lower level is 0. - nextPairFreq int32 - - // The number of chains remaining to generate for this level before moving - // up to the next level - needed int32 -} - -// set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code -} - -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } - -func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} -} - -// Generates a HuffmanCode corresponding to the fixed literal table -func generateFixedLiteralEncoding() *huffmanEncoder { - h := newHuffmanEncoder(maxNumLit) - codes := h.codes - var ch uint16 - for ch = 0; ch < maxNumLit; ch++ { - var bits uint16 - var size uint16 - switch { - case ch < 144: - // size 8, 000110000 .. 10111111 - bits = ch + 48 - size = 8 - break - case ch < 256: - // size 9, 110010000 .. 111111111 - bits = ch + 400 - 144 - size = 9 - break - case ch < 280: - // size 7, 0000000 .. 0010111 - bits = ch - 256 - size = 7 - break - default: - // size 8, 11000000 .. 11000111 - bits = ch + 192 - 280 - size = 8 - } - codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size} - } - return h -} - -func generateFixedOffsetEncoding() *huffmanEncoder { - h := newHuffmanEncoder(30) - codes := h.codes - for ch := range codes { - codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} - } - return h -} - -var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() -var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() - -func (h *huffmanEncoder) bitLength(freq []int32) int { - var total int - for i, f := range freq { - if f != 0 { - total += int(f) * int(h.codes[i].len) - } - } - return total -} - -const maxBitsLimit = 16 - -// Return the number of literals assigned to each bit size in the Huffman encoding -// -// This method is only called when list.length >= 3 -// The cases of 0, 1, and 2 literals are handled by special case code. -// -// list An array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency, and has as its last element a special element with frequency -// MaxInt32 -// maxBits The maximum number of bits that should be used to encode any literal. -// Must be less than 16. -// return An integer array in which array[i] indicates the number of literals -// that should be encoded in i bits. 
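bitCounts below answers "how many literals get i-bit codes" directly, without building an explicit Huffman tree. For any complete prefix code the returned counts must satisfy the Kraft equality, which makes a handy sanity check; the counts below are a worked example for frequencies {5, 2, 1, 1}, whose optimal code uses one 1-bit, one 2-bit, and two 3-bit codes:

package main

import "fmt"

func main() {
	// bitCount[i] = number of literals encoded in i bits; this is the
	// shape of the slice bitCounts returns.
	bitCount := []int{0, 1, 1, 2}

	// Kraft equality: the sum of count[i]/2^i over all lengths must be
	// exactly 1 for a complete (neither over- nor under-subscribed) code.
	sum := 0.0
	for i, c := range bitCount {
		sum += float64(c) / float64(uint(1)<<uint(i))
	}
	fmt.Println(sum == 1.0) // true
}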
-func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { - if maxBits >= maxBitsLimit { - panic("flate: maxBits too large") - } - n := int32(len(list)) - list = list[0 : n+1] - list[n] = maxNode() - - // The tree can't have greater depth than n - 1, no matter what. This - // saves a little bit of work in some small cases - if maxBits > n-1 { - maxBits = n - 1 - } - - // Create information about each of the levels. - // A bogus "Level 0" whose sole purpose is so that - // level1.prev.needed==0. This makes level1.nextPairFreq - // be a legitimate value that never gets chosen. - var levels [maxBitsLimit]levelInfo - // leafCounts[i] counts the number of literals at the left - // of ancestors of the rightmost node at level i. - // leafCounts[i][j] is the number of literals at the left - // of the level j ancestor. - var leafCounts [maxBitsLimit][maxBitsLimit]int32 - - for level := int32(1); level <= maxBits; level++ { - // For every level, the first two items are the first two characters. - // We initialize the levels as if we had already figured this out. - levels[level] = levelInfo{ - level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, - } - leafCounts[level][level] = 2 - if level == 1 { - levels[level].nextPairFreq = math.MaxInt32 - } - } - - // We need a total of 2*n - 2 items at top level and have already generated 2. - levels[maxBits].needed = 2*n - 4 - - level := maxBits - for { - l := &levels[level] - if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { - // We've run out of both leafs and pairs. - // End all calculations for this level. - // To make sure we never come back to this level or any lower level, - // set nextPairFreq impossibly large. - l.needed = 0 - levels[level+1].nextPairFreq = math.MaxInt32 - level++ - continue - } - - prevFreq := l.lastFreq - if l.nextCharFreq < l.nextPairFreq { - // The next item on this row is a leaf node. - n := leafCounts[level][level] + 1 - l.lastFreq = l.nextCharFreq - // Lower leafCounts are the same of the previous node. - leafCounts[level][level] = n - l.nextCharFreq = list[n].freq - } else { - // The next item on this row is a pair from the previous row. - // nextPairFreq isn't valid until we generate two - // more values in the level below - l.lastFreq = l.nextPairFreq - // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) - levels[l.level-1].needed = 2 - } - - if l.needed--; l.needed == 0 { - // We've done everything we need to do for this level. - // Continue calculating one level up. Fill in nextPairFreq - // of that level with the sum of the two nodes we've just calculated on - // this level. - if l.level == maxBits { - // All done! - break - } - levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq - level++ - } else { - // If we stole from below, move down temporarily to replenish it. - for levels[level-1].needed > 0 { - level-- - } - } - } - - // Somethings is wrong if at the end, the top level is null or hasn't used - // all of the leaves. - if leafCounts[maxBits][maxBits] != n { - panic("leafCounts[maxBits][maxBits] != n") - } - - bitCount := h.bitCount[:maxBits+1] - bits := 1 - counts := &leafCounts[maxBits] - for level := maxBits; level > 0; level-- { - // chain.leafCount gives the number of literals requiring at least "bits" - // bits to encode. 
- bitCount[bits] = counts[level] - counts[level-1] - bits++ - } - return bitCount -} - -// Look at the leaves and assign them a bit count and an encoding as specified -// in RFC 1951 3.2.2 -func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) { - code := uint16(0) - for n, bits := range bitCount { - code <<= 1 - if n == 0 || bits == 0 { - continue - } - // The literals list[len(list)-bits] .. list[len(list)-bits] - // are encoded using "bits" bits, and get the values - // code, code + 1, .... The code values are - // assigned in literal order (not frequency order). - chunk := list[len(list)-int(bits):] - - h.lns.sort(chunk) - for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} - code++ - } - list = list[0 : len(list)-int(bits)] - } -} - -// Update this Huffman Code object to be the minimum code for the specified frequency count. -// -// freq An array of frequencies, in which frequency[i] gives the frequency of literal i. -// maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. - h.freqcache = make([]literalNode, maxNumLit+1) - } - list := h.freqcache[:len(freq)+1] - // Number of non-zero literals - count := 0 - // Set list to be the set of all non-zero literals and their frequencies - for i, f := range freq { - if f != 0 { - list[count] = literalNode{uint16(i), f} - count++ - } else { - list[count] = literalNode{} - h.codes[i].len = 0 - } - } - list[len(freq)] = literalNode{} - - list = list[:count] - if count <= 2 { - // Handle the small cases here, because they are awkward for the general case code. With - // two or fewer literals, everything has bit length 1. - for i, node := range list { - // "list" is in order of increasing literal value. - h.codes[node.literal].set(uint16(i), 1) - } - return - } - h.lfs.sort(list) - - // Get the number of literals for each bit count - bitCount := h.bitCounts(list, maxBits) - // And do the assignment - h.assignEncodingAndSize(bitCount, list) -} - -type byLiteral []literalNode - -func (s *byLiteral) sort(a []literalNode) { - *s = byLiteral(a) - sort.Sort(s) -} - -func (s byLiteral) Len() int { return len(s) } - -func (s byLiteral) Less(i, j int) bool { - return s[i].literal < s[j].literal -} - -func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -type byFreq []literalNode - -func (s *byFreq) sort(a []literalNode) { - *s = byFreq(a) - sort.Sort(s) -} - -func (s byFreq) Len() int { return len(s) } - -func (s byFreq) Less(i, j int) bool { - if s[i].freq == s[j].freq { - return s[i].literal < s[j].literal - } - return s[i].freq < s[j].freq -} - -func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go deleted file mode 100644 index 53b63d9a..00000000 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ /dev/null @@ -1,846 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package flate implements the DEFLATE compressed data format, described in -// RFC 1951. 
The gzip and zlib packages implement access to DEFLATE-based file -// formats. -package flate - -import ( - "bufio" - "io" - "strconv" - "sync" -) - -const ( - maxCodeLen = 16 // max length of Huffman code - // The next three numbers come from the RFC section 3.2.7, with the - // additional proviso in section 3.2.5 which implies that distance codes - // 30 and 31 should never occur in compressed data. - maxNumLit = 286 - maxNumDist = 30 - numCodes = 19 // number of codes in Huffman meta-code -) - -// Initialize the fixedHuffmanDecoder only once upon first use. -var fixedOnce sync.Once -var fixedHuffmanDecoder huffmanDecoder - -// A CorruptInputError reports the presence of corrupt input at a given offset. -type CorruptInputError int64 - -func (e CorruptInputError) Error() string { - return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) -} - -// An InternalError reports an error in the flate code itself. -type InternalError string - -func (e InternalError) Error() string { return "flate: internal error: " + string(e) } - -// A ReadError reports an error encountered while reading input. -// -// Deprecated: No longer returned. -type ReadError struct { - Offset int64 // byte offset where error occurred - Err error // error returned by underlying Read -} - -func (e *ReadError) Error() string { - return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() -} - -// A WriteError reports an error encountered while writing output. -// -// Deprecated: No longer returned. -type WriteError struct { - Offset int64 // byte offset where error occurred - Err error // error returned by underlying Write -} - -func (e *WriteError) Error() string { - return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() -} - -// Resetter resets a ReadCloser returned by NewReader or NewReaderDict to -// to switch to a new underlying Reader. This permits reusing a ReadCloser -// instead of allocating a new one. -type Resetter interface { - // Reset discards any buffered data and resets the Resetter as if it was - // newly initialized with the given reader. - Reset(r io.Reader, dict []byte) error -} - -// The data structure for decoding Huffman tables is based on that of -// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits), -// For codes smaller than the table width, there are multiple entries -// (each combination of trailing bits has the same value). For codes -// larger than the table width, the table contains a link to an overflow -// table. The width of each entry in the link table is the maximum code -// size minus the chunk width. -// -// Note that you can do a lookup in the table even without all bits -// filled. Since the extra bits are zero, and the DEFLATE Huffman codes -// have the property that shorter codes come before longer ones, the -// bit length estimate in the result is a lower bound on the actual -// number of bits. -// -// See the following: -// http://www.gzip.org/algorithm.txt - -// chunk & 15 is number of bits -// chunk >> 4 is value, including table link - -const ( - huffmanChunkBits = 9 - huffmanNumChunks = 1 << huffmanChunkBits - huffmanCountMask = 15 - huffmanValueShift = 4 -) - -type huffmanDecoder struct { - min int // the minimum code length - chunks [huffmanNumChunks]uint32 // chunks as described above - links [][]uint32 // overflow links - linkMask uint32 // mask the width of the link table -} - -// Initialize Huffman decoding tables from array of code lengths. 
-// Following this function, h is guaranteed to be initialized into a complete -// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a -// degenerate case where the tree has only a single symbol with length 1. Empty -// trees are permitted. -func (h *huffmanDecoder) init(bits []int) bool { - // Sanity enables additional runtime tests during Huffman - // table construction. It's intended to be used during - // development to supplement the currently ad-hoc unit tests. - const sanity = false - - if h.min != 0 { - *h = huffmanDecoder{} - } - - // Count number of codes of each length, - // compute min and max length. - var count [maxCodeLen]int - var min, max int - for _, n := range bits { - if n == 0 { - continue - } - if min == 0 || n < min { - min = n - } - if n > max { - max = n - } - count[n]++ - } - - // Empty tree. The decompressor.huffSym function will fail later if the tree - // is used. Technically, an empty tree is only valid for the HDIST tree and - // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree - // is guaranteed to fail since it will attempt to use the tree to decode the - // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is - // guaranteed to fail later since the compressed data section must be - // composed of at least one symbol (the end-of-block marker). - if max == 0 { - return true - } - - code := 0 - var nextcode [maxCodeLen]int - for i := min; i <= max; i++ { - code <<= 1 - nextcode[i] = code - code += count[i] - } - - // Check that the coding is complete (i.e., that we've - // assigned all 2-to-the-max possible bit sequences). - // Exception: To be compatible with zlib, we also need to - // accept degenerate single-code codings. See also - // TestDegenerateHuffmanCoding. - if code != 1<<uint(max) && !(code == 1 && max == 1) { - return false - } - - h.min = min - if max > huffmanChunkBits { - numLinks := 1 << (uint(max) - huffmanChunkBits) - h.linkMask = uint32(numLinks - 1) - - // create link tables - link := nextcode[huffmanChunkBits+1] >> 1 - h.links = make([][]uint32, huffmanNumChunks-link) - for j := uint(link); j < huffmanNumChunks; j++ { - reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8 - reverse >>= uint(16 - huffmanChunkBits) - off := j - uint(link) - if sanity && h.chunks[reverse] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) - h.links[off] = make([]uint32, numLinks) - } - } - - for i, n := range bits { - if n == 0 { - continue - } - code := nextcode[n] - nextcode[n]++ - chunk := uint32(i<<huffmanValueShift | n) - reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8 - reverse >>= uint(16 - n) - if n <= huffmanChunkBits { - for off := reverse; off < len(h.chunks); off += 1 << uint(n) { - // We should never need to overwrite - // an existing chunk. Also, 0 is - // never a valid chunk, because the - // lower 4 "count" bits should be - // between 1 and 15. - if sanity && h.chunks[off] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[off] = chunk - } - } else { - j := reverse & (huffmanNumChunks - 1) - if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { - // Longer codes should have been - // associated with a link table above. 
- panic("impossible: not an indirect chunk") - } - value := h.chunks[j] >> huffmanValueShift - linktab := h.links[value] - reverse >>= huffmanChunkBits - for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { - if sanity && linktab[off] != 0 { - panic("impossible: overwriting existing chunk") - } - linktab[off] = chunk - } - } - } - - if sanity { - // Above we've sanity checked that we never overwrote - // an existing entry. Here we additionally check that - // we filled the tables completely. - for i, chunk := range h.chunks { - if chunk == 0 { - // As an exception, in the degenerate - // single-code case, we allow odd - // chunks to be missing. - if code == 1 && i%2 == 1 { - continue - } - panic("impossible: missing chunk") - } - } - for _, linktab := range h.links { - for _, chunk := range linktab { - if chunk == 0 { - panic("impossible: missing chunk") - } - } - } - } - - return true -} - -// The actual read interface needed by NewReader. -// If the passed in io.Reader does not also have ReadByte, -// the NewReader will introduce its own buffering. -type Reader interface { - io.Reader - io.ByteReader -} - -// Decompress state. -type decompressor struct { - // Input source. - r Reader - roffset int64 - - // Input bits, in top of b. - b uint32 - nb uint - - // Huffman decoders for literal/length, distance. - h1, h2 huffmanDecoder - - // Length arrays used to define Huffman codes. - bits *[maxNumLit + maxNumDist]int - codebits *[numCodes]int - - // Output history, buffer. - dict dictDecoder - - // Temporary buffer (avoids repeated allocation). - buf [4]byte - - // Next step in the decompression, - // and decompression state. - step func(*decompressor) - stepState int - final bool - err error - toRead []byte - hl, hd *huffmanDecoder - copyLen int - copyDist int -} - -func (f *decompressor) nextBlock() { - for f.nb < 1+2 { - if f.err = f.moreBits(); f.err != nil { - return - } - } - f.final = f.b&1 == 1 - f.b >>= 1 - typ := f.b & 3 - f.b >>= 2 - f.nb -= 1 + 2 - switch typ { - case 0: - f.dataBlock() - case 1: - // compressed, fixed Huffman tables - f.hl = &fixedHuffmanDecoder - f.hd = nil - f.huffmanBlock() - case 2: - // compressed, dynamic Huffman tables - if f.err = f.readHuffman(); f.err != nil { - break - } - f.hl = &f.h1 - f.hd = &f.h2 - f.huffmanBlock() - default: - // 3 is reserved. - f.err = CorruptInputError(f.roffset) - } -} - -func (f *decompressor) Read(b []byte) (int, error) { - for { - if len(f.toRead) > 0 { - n := copy(b, f.toRead) - f.toRead = f.toRead[n:] - if len(f.toRead) == 0 { - return n, f.err - } - return n, nil - } - if f.err != nil { - return 0, f.err - } - f.step(f) - if f.err != nil && len(f.toRead) == 0 { - f.toRead = f.dict.readFlush() // Flush what's left in case of error - } - } -} - -// Support the io.WriteTo interface for io.Copy and friends. 
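WriteTo below exists so that io.Copy can hand the decompressor its destination directly instead of shuttling data through an intermediate buffer: io.Copy prefers a source that implements io.WriterTo. A simplified sketch of that standard-library dispatch (copyFast is a hypothetical name):

package main

import (
	"io"
	"os"
	"strings"
)

// copyFast mimics the relevant branch of io.Copy: if the source knows
// how to write itself out, let it drive the loop.
func copyFast(dst io.Writer, src io.Reader) (int64, error) {
	if wt, ok := src.(io.WriterTo); ok {
		return wt.WriteTo(dst)
	}
	return io.Copy(dst, src) // falls back to the buffered loop
}

func main() {
	// strings.Reader implements io.WriterTo, so this takes the fast path,
	// just as the decompressor below would.
	copyFast(os.Stdout, strings.NewReader("hello\n"))
}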
-func (f *decompressor) WriteTo(w io.Writer) (int64, error) { - total := int64(0) - flushed := false - for { - if len(f.toRead) > 0 { - n, err := w.Write(f.toRead) - total += int64(n) - if err != nil { - f.err = err - return total, err - } - if n != len(f.toRead) { - return total, io.ErrShortWrite - } - f.toRead = f.toRead[:0] - } - if f.err != nil && flushed { - if f.err == io.EOF { - return total, nil - } - return total, f.err - } - if f.err == nil { - f.step(f) - } - if len(f.toRead) == 0 && f.err != nil && !flushed { - f.toRead = f.dict.readFlush() // Flush what's left in case of error - flushed = true - } - } -} - -func (f *decompressor) Close() error { - if f.err == io.EOF { - return nil - } - return f.err -} - -// RFC 1951 section 3.2.7. -// Compression with dynamic Huffman codes - -var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} - -func (f *decompressor) readHuffman() error { - // HLIT[5], HDIST[5], HCLEN[4]. - for f.nb < 5+5+4 { - if err := f.moreBits(); err != nil { - return err - } - } - nlit := int(f.b&0x1F) + 257 - if nlit > maxNumLit { - return CorruptInputError(f.roffset) - } - f.b >>= 5 - ndist := int(f.b&0x1F) + 1 - if ndist > maxNumDist { - return CorruptInputError(f.roffset) - } - f.b >>= 5 - nclen := int(f.b&0xF) + 4 - // numCodes is 19, so nclen is always valid. - f.b >>= 4 - f.nb -= 5 + 5 + 4 - - // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order. - for i := 0; i < nclen; i++ { - for f.nb < 3 { - if err := f.moreBits(); err != nil { - return err - } - } - f.codebits[codeOrder[i]] = int(f.b & 0x7) - f.b >>= 3 - f.nb -= 3 - } - for i := nclen; i < len(codeOrder); i++ { - f.codebits[codeOrder[i]] = 0 - } - if !f.h1.init(f.codebits[0:]) { - return CorruptInputError(f.roffset) - } - - // HLIT + 257 code lengths, HDIST + 1 code lengths, - // using the code length Huffman code. - for i, n := 0, nlit+ndist; i < n; { - x, err := f.huffSym(&f.h1) - if err != nil { - return err - } - if x < 16 { - // Actual length. - f.bits[i] = x - i++ - continue - } - // Repeat previous length or zero. - var rep int - var nb uint - var b int - switch x { - default: - return InternalError("unexpected length code") - case 16: - rep = 3 - nb = 2 - if i == 0 { - return CorruptInputError(f.roffset) - } - b = f.bits[i-1] - case 17: - rep = 3 - nb = 3 - b = 0 - case 18: - rep = 11 - nb = 7 - b = 0 - } - for f.nb < nb { - if err := f.moreBits(); err != nil { - return err - } - } - rep += int(f.b & uint32(1<<nb-1)) - f.b >>= nb - f.nb -= nb - if i+rep > n { - return CorruptInputError(f.roffset) - } - for j := 0; j < rep; j++ { - f.bits[i] = b - i++ - } - } - - if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) { - return CorruptInputError(f.roffset) - } - - // As an optimization, we can initialize the min bits to read at a time - // for the HLIT tree to the length of the EOB marker since we know that - // every block must terminate with one. This preserves the property that - // we never read any extra bytes after the end of the DEFLATE stream. - if f.h1.min < f.bits[endBlockMarker] { - f.h1.min = f.bits[endBlockMarker] - } - - return nil -} - -// Decode a single Huffman block from f. -// hl and hd are the Huffman states for the lit/length values -// and the distance values, respectively. If hd == nil, using the -// fixed distance encoding associated with fixed Huffman blocks. 
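The case arms in huffmanBlock below recover a match length from the symbol v with closed-form arithmetic instead of a table: for example, for 265 <= v < 269 the expression length = v*2 - (265*2 - 11) yields the base length before the single extra bit is read. Checking one case against RFC 1951 section 3.2.5 (values only, not this package's API):

package main

import "fmt"

func main() {
	v := 266 // a literal/length symbol in the 265..268 band
	length := v*2 - (265*2 - 11)

	// Code 266 has base length 13 and 1 extra bit, so it covers match
	// lengths 13..14 per the RFC.
	fmt.Println(length, length+1) // 13 14
}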
-func (f *decompressor) huffmanBlock() { - const ( - stateInit = iota // Zero value must be stateInit - stateDict - ) - - switch f.stepState { - case stateInit: - goto readLiteral - case stateDict: - goto copyHistory - } - -readLiteral: - // Read literal and/or (length, distance) according to RFC section 3.2.3. - { - v, err := f.huffSym(f.hl) - if err != nil { - f.err = err - return - } - var n uint // number of bits extra - var length int - switch { - case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlock - f.stepState = stateInit - return - } - goto readLiteral - case v == 256: - f.finishBlock() - return - // otherwise, reference to older data - case v < 265: - length = v - (257 - 3) - n = 0 - case v < 269: - length = v*2 - (265*2 - 11) - n = 1 - case v < 273: - length = v*4 - (269*4 - 19) - n = 2 - case v < 277: - length = v*8 - (273*8 - 35) - n = 3 - case v < 281: - length = v*16 - (277*16 - 67) - n = 4 - case v < 285: - length = v*32 - (281*32 - 131) - n = 5 - case v < maxNumLit: - length = 258 - n = 0 - default: - f.err = CorruptInputError(f.roffset) - return - } - if n > 0 { - for f.nb < n { - if err = f.moreBits(); err != nil { - f.err = err - return - } - } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n - f.nb -= n - } - - var dist int - if f.hd == nil { - for f.nb < 5 { - if err = f.moreBits(); err != nil { - f.err = err - return - } - } - dist = int(reverseByte[(f.b&0x1F)<<3]) - f.b >>= 5 - f.nb -= 5 - } else { - if dist, err = f.huffSym(f.hd); err != nil { - f.err = err - return - } - } - - switch { - case dist < 4: - dist++ - case dist < maxNumDist: - nb := uint(dist-2) >> 1 - // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << nb - for f.nb < nb { - if err = f.moreBits(); err != nil { - f.err = err - return - } - } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb - f.nb -= nb - dist = 1<<(nb+1) + 1 + extra - default: - f.err = CorruptInputError(f.roffset) - return - } - - // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { - f.err = CorruptInputError(f.roffset) - return - } - - f.copyLen, f.copyDist = length, dist - goto copyHistory - } - -copyHistory: - // Perform a backwards copy according to RFC section 3.2.3. - { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) - if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) - } - f.copyLen -= cnt - - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlock // We need to continue this work - f.stepState = stateDict - return - } - goto readLiteral - } -} - -// Copy a single uncompressed data block from input to output. -func (f *decompressor) dataBlock() { - // Uncompressed. - // Discard current half-byte. - f.nb = 0 - f.b = 0 - - // Length then ones-complement of length. - nr, err := io.ReadFull(f.r, f.buf[0:4]) - f.roffset += int64(nr) - if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - f.err = err - return - } - n := int(f.buf[0]) | int(f.buf[1])<<8 - nn := int(f.buf[2]) | int(f.buf[3])<<8 - if uint16(nn) != uint16(^n) { - f.err = CorruptInputError(f.roffset) - return - } - - if n == 0 { - f.toRead = f.dict.readFlush() - f.finishBlock() - return - } - - f.copyLen = n - f.copyData() -} - -// copyData copies f.copyLen bytes from the underlying reader into f.hist. -// It pauses for reads when f.hist is full. 
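dataBlock above validates a stored block's framing: LEN arrives little-endian, followed by NLEN, its ones' complement, and the block is rejected when uint16(nn) != uint16(^n). A sketch of what a well-formed header looks like, with an illustrative payload length:

package main

import "fmt"

func main() {
	n := 5 // stored-block payload length
	nlen := ^uint16(n)

	// Little-endian LEN then NLEN, as dataBlock reads them into f.buf.
	hdr := []byte{byte(n), byte(n >> 8), byte(nlen), byte(nlen >> 8)}
	fmt.Printf("% x\n", hdr) // 05 00 fa ff
}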
-func (f *decompressor) copyData() { - buf := f.dict.writeSlice() - if len(buf) > f.copyLen { - buf = buf[:f.copyLen] - } - - cnt, err := io.ReadFull(f.r, buf) - f.roffset += int64(cnt) - f.copyLen -= cnt - f.dict.writeMark(cnt) - if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - f.err = err - return - } - - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).copyData - return - } - f.finishBlock() -} - -func (f *decompressor) finishBlock() { - if f.final { - if f.dict.availRead() > 0 { - f.toRead = f.dict.readFlush() - } - f.err = io.EOF - } - f.step = (*decompressor).nextBlock -} - -func (f *decompressor) moreBits() error { - c, err := f.r.ReadByte() - if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return err - } - f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 - return nil -} - -// Read the next Huffman-encoded symbol from f according to h. -func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { - // Since a huffmanDecoder can be empty or be composed of a degenerate tree - // with single element, huffSym must error on these two edge cases. In both - // cases, the chunks slice will be 0 for the invalid sequence, leading it - // satisfy the n == 0 check below. - n := uint(h.min) - for { - for f.nb < n { - if err := f.moreBits(); err != nil { - return 0, err - } - } - chunk := h.chunks[f.b&(huffmanNumChunks-1)] - n = uint(chunk & huffmanCountMask) - if n > huffmanChunkBits { - chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask] - n = uint(chunk & huffmanCountMask) - } - if n <= f.nb { - if n == 0 { - f.err = CorruptInputError(f.roffset) - return 0, f.err - } - f.b >>= n - f.nb -= n - return int(chunk >> huffmanValueShift), nil - } - } -} - -func makeReader(r io.Reader) Reader { - if rr, ok := r.(Reader); ok { - return rr - } - return bufio.NewReader(r) -} - -func fixedHuffmanDecoderInit() { - fixedOnce.Do(func() { - // These come from the RFC section 3.2.6. - var bits [288]int - for i := 0; i < 144; i++ { - bits[i] = 8 - } - for i := 144; i < 256; i++ { - bits[i] = 9 - } - for i := 256; i < 280; i++ { - bits[i] = 7 - } - for i := 280; i < 288; i++ { - bits[i] = 8 - } - fixedHuffmanDecoder.init(bits[:]) - }) -} - -func (f *decompressor) Reset(r io.Reader, dict []byte) error { - *f = decompressor{ - r: makeReader(r), - bits: f.bits, - codebits: f.codebits, - dict: f.dict, - step: (*decompressor).nextBlock, - } - f.dict.init(maxMatchOffset, dict) - return nil -} - -// NewReader returns a new ReadCloser that can be used -// to read the uncompressed version of r. -// If r does not also implement io.ByteReader, -// the decompressor may read more data than necessary from r. -// It is the caller's responsibility to call Close on the ReadCloser -// when finished reading. -// -// The ReadCloser returned by NewReader also implements Resetter. -func NewReader(r io.Reader) io.ReadCloser { - fixedHuffmanDecoderInit() - - var f decompressor - f.r = makeReader(r) - f.bits = new([maxNumLit + maxNumDist]int) - f.codebits = new([numCodes]int) - f.step = (*decompressor).nextBlock - f.dict.init(maxMatchOffset, nil) - return &f -} - -// NewReaderDict is like NewReader but initializes the reader -// with a preset dictionary. The returned Reader behaves as if -// the uncompressed data stream started with the given dictionary, -// which has already been read. NewReaderDict is typically used -// to read data compressed by NewWriterDict. 
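NewReaderDict below pairs with NewWriterDict: both sides pre-load the same dictionary so that early back-references resolve. A round-trip sketch using the standard library's compress/flate, which exposes the same API that this vendored package mirrors:

package main

import (
	"bytes"
	"compress/flate" // same API as this vendored package
	"io"
	"log"
)

func main() {
	dict := []byte("a shared preset dictionary")
	data := []byte("a shared preset dictionary helps short payloads")

	var buf bytes.Buffer
	w, err := flate.NewWriterDict(&buf, flate.BestCompression, dict)
	if err != nil {
		log.Fatal(err)
	}
	w.Write(data)
	w.Close()
	clen := buf.Len()

	// The reader must be seeded with the identical dictionary.
	r := flate.NewReaderDict(&buf, dict)
	defer r.Close()
	out, err := io.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("%d -> %d bytes, ok=%v", len(data), clen, bytes.Equal(out, data))
}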
-// -// The ReadCloser returned by NewReader also implements Resetter. -func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser { - fixedHuffmanDecoderInit() - - var f decompressor - f.r = makeReader(r) - f.bits = new([maxNumLit + maxNumDist]int) - f.codebits = new([numCodes]int) - f.step = (*decompressor).nextBlock - f.dict.init(maxMatchOffset, dict) - return &f -} diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go deleted file mode 100644 index c1a02720..00000000 --- a/vendor/github.com/klauspost/compress/flate/reverse_bits.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -var reverseByte = [256]byte{ - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, -} - -func reverseUint16(v uint16) uint16 { - return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8 -} - -func reverseBits(number uint16, bitLength byte) uint16 { - return reverseUint16(number << uint8(16-bitLength)) -} diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go deleted file mode 100644 index 8a57c05b..00000000 --- a/vendor/github.com/klauspost/compress/flate/snappy.go +++ /dev/null @@ -1,900 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Modified for deflate by Klaus Post (c) 2015. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// emitLiteral writes a literal chunk and returns the number of bytes written. -func emitLiteral(dst *tokens, lit []byte) { - ol := int(dst.n) - for i, v := range lit { - dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) - } - dst.n += uint16(len(lit)) -} - -// emitCopy writes a copy chunk and returns the number of bytes written. 
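reverse_bits.go above precomputes a 256-entry byte-reversal table; the same values are available from math/bits (added in Go 1.9, after this table was written), which makes for a cheap cross-check:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// bits.Reverse8 mirrors all 8 bits of a byte, which is exactly what
	// each reverseByte table entry stores.
	fmt.Printf("%#02x\n", bits.Reverse8(0x01)) // 0x80

	// reverseBits(number, bitLength) above mirrors only the low
	// bitLength bits; the same effect via the stdlib:
	number, bitLength := uint16(0b110), uint(3)
	fmt.Printf("%03b\n", bits.Reverse16(number<<(16-bitLength))) // 011
}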
-func emitCopy(dst *tokens, offset, length int) { - dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize)) - dst.n++ -} - -type snappyEnc interface { - Encode(dst *tokens, src []byte) - Reset() -} - -func newSnappy(level int) snappyEnc { - switch level { - case 1: - return &snappyL1{} - case 2: - return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 3: - return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 4: - return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}} - default: - panic("invalid level specified") - } -} - -const ( - tableBits = 14 // Bits used in the table - tableSize = 1 << tableBits // Size of the table - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - baseMatchOffset = 1 // The smallest match offset - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - maxMatchOffset = 1 << 15 // The largest match offset -) - -func load32(b []byte, i int) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} - -// snappyL1 encapsulates level 1 compression -type snappyL1 struct{} - -func (e *snappyL1) Reset() {} - -func (e *snappyL1) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - return - } - - // Initialize the hash table. - // - // The table element type is uint16, as s < sLimit and sLimit < len(src) - // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535. - var table [tableSize]uint16 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - nextHash := hash(load32(src, s)) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. 
This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := 32 - - nextS := s - candidate := 0 - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = int(table[nextHash&tableMask]) - table[nextHash&tableMask] = uint16(s) - nextHash = hash(load32(src, nextS)) - if s-candidate <= maxMatchOffset && load32(src, s) == load32(src, candidate) { - break - } - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - - // Extend the 4-byte match as long as possible. - // - // This is an inlined version of Snappy's: - // s = extendMatch(src, candidate+4, s+4) - s += 4 - s1 := base + maxMatchLength - if s1 > len(src) { - s1 = len(src) - } - a := src[s:s1] - b := src[candidate+4:] - b = b[:len(a)] - l := len(a) - for i := range a { - if a[i] != b[i] { - l = i - break - } - } - s += l - - // matchToken is flate's equivalent of Snappy's emitCopy. - dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)) - dst.n++ - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x >> 0)) - table[prevHash&tableMask] = uint16(s - 1) - currHash := hash(uint32(x >> 8)) - candidate = int(table[currHash&tableMask]) - table[currHash&tableMask] = uint16(s) - if s-candidate > maxMatchOffset || uint32(x>>8) != load32(src, candidate) { - nextHash = hash(uint32(x >> 16)) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - emitLiteral(dst, src[nextEmit:]) - } -} - -type tableEntry struct { - val uint32 - offset int32 -} - -func load3232(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. 
- return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load6432(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -// snappyGen maintains the table for matches, -// and the previous byte block for level 2. -// This is the generic implementation. -type snappyGen struct { - prev []byte - cur int32 -} - -// snappyL2 is the level 2 encoder. It extends the -// generic snappyGen state with a hash table, so that -// matches can also be found across block boundaries. -type snappyL2 struct { - snappyGen - table [tableSize]tableEntry -} - -// Encode uses a similar algorithm to level 1, but is capable -// of matching across blocks, giving better compression at a small slowdown. -func (e *snappyL2) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntry{} - } - e.cur = maxStoreBlockSize - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes.
- emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - e.table[hash(cv)&tableMask] = tableEntry{offset: t + e.cur, val: cv} - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -type tableEntryPrev struct { - Cur tableEntry - Prev tableEntry -} - -// snappyL3 encapsulates level 3 compression. -type snappyL3 struct { - snappyGen - table [tableSize]tableEntryPrev -} - -// Encode uses a similar algorithm to level 2, but will check up to two candidates. -func (e *snappyL3) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntryPrev{} - } - e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]} - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from.
- nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash = hash(cv) - e.table[nextHash&tableMask] = tableEntryPrev{ - Prev: e.table[nextHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + t, val: cv}, - } - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. 
At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -// snappyL4 -type snappyL4 struct { - snappyL3 -} - -// Encode uses a similar algorithm to level 3, -// but will check up to two candidates if first isn't long enough. -func (e *snappyL4) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 3 - minNonLiteralBlockSize = 1 + 1 + inputMargin - matchLenGood = 12 - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntryPrev{} - } - e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]} - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. 
- // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - var candidateAlt tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset < maxMatchOffset { - candidateAlt = candidates.Prev - } - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - // Try alternative candidate if match length < matchLenGood. - if l < matchLenGood-4 && candidateAlt.offset != 0 { - t2 := candidateAlt.offset - e.cur + 4 - l2 := e.matchlen(s, t2, src) - if l2 > l { - l = l2 - t = t2 - } - } - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash = hash(cv) - e.table[nextHash&tableMask] = tableEntryPrev{ - Prev: e.table[nextHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + t, val: cv}, - } - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. 
- x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - candidateAlt = tableEntry{} - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset <= maxMatchOffset { - candidateAlt = candidates.Prev - } - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -func (e *snappyGen) matchlen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) - } - - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } - - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } - - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] - } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } - } - - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n - } - - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] - for i := range a { - if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset the encoding table. -func (e *snappyGen) Reset() { - e.prev = e.prev[:0] - e.cur += maxMatchOffset -} diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go deleted file mode 100644 index 4f275ea6..00000000 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package flate - -import "fmt" - -const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<<lengthShift - 1 - typeMask = 3 << 30 - literalType = 0 << 30 - matchType = 1 << 30 -) - -// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) -// is lengthCodes[length - MIN_MATCH_LENGTH] -var lengthCodes = [...]uint32{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, - 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, - 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, - 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, - 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, - 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 28, -} - -var offsetCodes = [...]uint32{ - 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, -} - -type token uint32 - -type tokens struct { - tokens [maxStoreBlockSize + 1]token - n uint16 // Must be able to contain maxStoreBlockSize -} - -// Convert a literal into a literal token. -func literalToken(literal uint32) token { return token(literalType + literal) } - -// Convert a < xlength, xoffset > pair into a match token. 
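To make the 32-bit layout concrete, here is a standalone check of the packing that matchToken (defined next) performs; the constants are copied from this file, and the length 6 and offset 100 are arbitrary example values.

```Go
package main

import "fmt"

// Constants copied from token.go above.
const (
	lengthShift = 22
	offsetMask  = 1<<lengthShift - 1
	typeMask    = 3 << 30
	matchType   = 1 << 30
)

func main() {
	// A match of length 6 at offset 100 is stored with
	// xlength = 6 - MIN_MATCH_LENGTH (3) and xoffset = 100 - MIN_OFFSET_SIZE (1).
	t := uint32(matchType) + 3<<lengthShift + 99

	fmt.Println(t&typeMask == matchType)        // true: the type bits say "match"
	fmt.Println((t - matchType) >> lengthShift) // 3: the stored xlength
	fmt.Println(t & offsetMask)                 // 99: the stored xoffset
}
```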
-func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength<<lengthShift + xoffset) -} - -func matchTokend(xlength uint32, xoffset uint32) token { - if xlength > maxMatchLength || xoffset > maxMatchOffset { - panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset)) - return token(matchType) - } - return token(matchType + xlength<<lengthShift + xoffset) -} - -// Returns the type of a token -func (t token) typ() uint32 { return uint32(t) & typeMask } - -// Returns the literal of a literal token -func (t token) literal() uint32 { return uint32(t - literalType) } - -// Returns the extra offset of a match token -func (t token) offset() uint32 { return uint32(t) & offsetMask } - -func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) } - -func lengthCode(len uint32) uint32 { return lengthCodes[len] } - -// Returns the offset code corresponding to a specific offset -func offsetCode(off uint32) uint32 { - if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } else if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 - } else { - return offsetCodes[off>>14] + 28 - } -} diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go deleted file mode 100644 index e73fab3f..00000000 --- a/vendor/github.com/klauspost/compress/gzip/gunzip.go +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package gzip implements reading and writing of gzip format compressed files, -// as specified in RFC 1952. -package gzip - -import ( - "bufio" - "encoding/binary" - "errors" - "io" - "time" - - "github.com/klauspost/compress/flate" - "github.com/klauspost/crc32" -) - -const ( - gzipID1 = 0x1f - gzipID2 = 0x8b - gzipDeflate = 8 - flagText = 1 << 0 - flagHdrCrc = 1 << 1 - flagExtra = 1 << 2 - flagName = 1 << 3 - flagComment = 1 << 4 -) - -var ( - // ErrChecksum is returned when reading GZIP data that has an invalid checksum. - ErrChecksum = errors.New("gzip: invalid checksum") - // ErrHeader is returned when reading GZIP data that has an invalid header. - ErrHeader = errors.New("gzip: invalid header") -) - -var le = binary.LittleEndian - -// noEOF converts io.EOF to io.ErrUnexpectedEOF. -func noEOF(err error) error { - if err == io.EOF { - return io.ErrUnexpectedEOF - } - return err -} - -// The gzip file stores a header giving metadata about the compressed file. -// That header is exposed as the fields of the Writer and Reader structs. -// -// Strings must be UTF-8 encoded and may only contain Unicode code points -// U+0001 through U+00FF, due to limitations of the GZIP file format. -type Header struct { - Comment string // comment - Extra []byte // "extra data" - ModTime time.Time // modification time - Name string // file name - OS byte // operating system type -} - -// A Reader is an io.Reader that can be read to retrieve -// uncompressed data from a gzip-format compressed file. -// -// In general, a gzip file can be a concatenation of gzip files, -// each with its own header. Reads from the Reader -// return the concatenation of the uncompressed data of each. -// Only the first header is recorded in the Reader fields. -// -// Gzip files store a length and checksum of the uncompressed data. 
-// The Reader will return an ErrChecksum when Read -// reaches the end of the uncompressed data if it does not -// have the expected length or checksum. Clients should treat data -// returned by Read as tentative until they receive the io.EOF -// marking the end of the data. -type Reader struct { - Header // valid after NewReader or Reader.Reset - r flate.Reader - decompressor io.ReadCloser - digest uint32 // CRC-32, IEEE polynomial (section 8) - size uint32 // Uncompressed size (section 2.3.1) - buf [512]byte - err error - multistream bool -} - -// NewReader creates a new Reader reading the given reader. -// If r does not also implement io.ByteReader, -// the decompressor may read more data than necessary from r. -// -// It is the caller's responsibility to call Close on the Reader when done. -// -// The Reader.Header fields will be valid in the Reader returned. -func NewReader(r io.Reader) (*Reader, error) { - z := new(Reader) - if err := z.Reset(r); err != nil { - return nil, err - } - return z, nil -} - -// Reset discards the Reader z's state and makes it equivalent to the -// result of its original state from NewReader, but reading from r instead. -// This permits reusing a Reader rather than allocating a new one. -func (z *Reader) Reset(r io.Reader) error { - *z = Reader{ - decompressor: z.decompressor, - multistream: true, - } - if rr, ok := r.(flate.Reader); ok { - z.r = rr - } else { - z.r = bufio.NewReader(r) - } - z.Header, z.err = z.readHeader() - return z.err -} - -// Multistream controls whether the reader supports multistream files. - // -// If enabled (the default), the Reader expects the input to be a sequence -// of individually gzipped data streams, each with its own header and -// trailer, ending at EOF. The effect is that the concatenation of a sequence -// of gzipped files is treated as equivalent to the gzip of the concatenation -// of the sequence. This is standard behavior for gzip readers. -// -// Calling Multistream(false) disables this behavior; disabling the behavior -// can be useful when reading file formats that distinguish individual gzip -// data streams or mix gzip data streams with other data streams. -// In this mode, when the Reader reaches the end of the data stream, -// Read returns io.EOF. If the underlying reader implements io.ByteReader, -// it will be left positioned just after the gzip stream. -// To start the next stream, call z.Reset(r) followed by z.Multistream(false). -// If there is no next stream, z.Reset(r) will return io.EOF. -func (z *Reader) Multistream(ok bool) { - z.multistream = ok -} - -// readString reads a NUL-terminated string from z.r. -// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and -// will output a string encoded using UTF-8. -// This method always updates z.digest with the data read. -func (z *Reader) readString() (string, error) { - var err error - needConv := false - for i := 0; ; i++ { - if i >= len(z.buf) { - return "", ErrHeader - } - z.buf[i], err = z.r.ReadByte() - if err != nil { - return "", err - } - if z.buf[i] > 0x7f { - needConv = true - } - if z.buf[i] == 0 { - // Digest covers the NUL terminator. - z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1]) - - // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1). - if needConv { - s := make([]rune, 0, i) - for _, v := range z.buf[:i] { - s = append(s, rune(v)) - } - return string(s), nil - } - return string(z.buf[:i]), nil - } - } -} - -// readHeader reads the GZIP header according to section 2.3.1.
-// This method does not set z.err. -func (z *Reader) readHeader() (hdr Header, err error) { - if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil { - // RFC 1952, section 2.2, says the following: - // A gzip file consists of a series of "members" (compressed data sets). - // - // Other than this, the specification does not clarify whether a - // "series" is defined as "one or more" or "zero or more". To err on the - // side of caution, Go interprets this to mean "zero or more". - // Thus, it is okay to return io.EOF here. - return hdr, err - } - if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate { - return hdr, ErrHeader - } - flg := z.buf[3] - hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0) - // z.buf[8] is XFL and is currently ignored. - hdr.OS = z.buf[9] - z.digest = crc32.ChecksumIEEE(z.buf[:10]) - - if flg&flagExtra != 0 { - if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { - return hdr, noEOF(err) - } - z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2]) - data := make([]byte, le.Uint16(z.buf[:2])) - if _, err = io.ReadFull(z.r, data); err != nil { - return hdr, noEOF(err) - } - z.digest = crc32.Update(z.digest, crc32.IEEETable, data) - hdr.Extra = data - } - - var s string - if flg&flagName != 0 { - if s, err = z.readString(); err != nil { - return hdr, err - } - hdr.Name = s - } - - if flg&flagComment != 0 { - if s, err = z.readString(); err != nil { - return hdr, err - } - hdr.Comment = s - } - - if flg&flagHdrCrc != 0 { - if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { - return hdr, noEOF(err) - } - digest := le.Uint16(z.buf[:2]) - if digest != uint16(z.digest) { - return hdr, ErrHeader - } - } - - z.digest = 0 - if z.decompressor == nil { - z.decompressor = flate.NewReader(z.r) - } else { - z.decompressor.(flate.Resetter).Reset(z.r, nil) - } - return hdr, nil -} - -// Read implements io.Reader, reading uncompressed bytes from its underlying Reader. -func (z *Reader) Read(p []byte) (n int, err error) { - if z.err != nil { - return 0, z.err - } - - n, z.err = z.decompressor.Read(p) - z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) - z.size += uint32(n) - if z.err != io.EOF { - // In the normal case we return here. - return n, z.err - } - - // Finished file; check checksum and size. - if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { - z.err = noEOF(err) - return n, z.err - } - digest := le.Uint32(z.buf[:4]) - size := le.Uint32(z.buf[4:8]) - if digest != z.digest || size != z.size { - z.err = ErrChecksum - return n, z.err - } - z.digest, z.size = 0, 0 - - // File is ok; check if there is another. - if !z.multistream { - return n, io.EOF - } - z.err = nil // Remove io.EOF - - if _, z.err = z.readHeader(); z.err != nil { - return n, z.err - } - - // Read from next file, if necessary. - if n > 0 { - return n, nil - } - return z.Read(p) -} - -// Support the io.WriteTo interface for io.Copy and friends. -func (z *Reader) WriteTo(w io.Writer) (int64, error) { - total := int64(0) - crcWriter := crc32.NewIEEE() - for { - if z.err != nil { - if z.err == io.EOF { - return total, nil - } - return total, z.err - } - - // We write both to output and digest. - mw := io.MultiWriter(w, crcWriter) - n, err := z.decompressor.(io.WriterTo).WriteTo(mw) - total += n - z.size += uint32(n) - if err != nil { - z.err = err - return total, z.err - } - - // Finished file; check checksum + size. 
- if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - z.err = err - return total, err - } - z.digest = crcWriter.Sum32() - digest := le.Uint32(z.buf[:4]) - size := le.Uint32(z.buf[4:8]) - if digest != z.digest || size != z.size { - z.err = ErrChecksum - return total, z.err - } - z.digest, z.size = 0, 0 - - // File is ok; check if there is another. - if !z.multistream { - return total, nil - } - crcWriter.Reset() - z.err = nil // Remove io.EOF - - if _, z.err = z.readHeader(); z.err != nil { - if z.err == io.EOF { - return total, nil - } - return total, z.err - } - } -} - -// Close closes the Reader. It does not close the underlying io.Reader. -// In order for the GZIP checksum to be verified, the reader must be -// fully consumed until the io.EOF. -func (z *Reader) Close() error { return z.decompressor.Close() } diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go deleted file mode 100644 index a0f3ed0f..00000000 --- a/vendor/github.com/klauspost/compress/gzip/gzip.go +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2010 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package gzip - -import ( - "errors" - "fmt" - "io" - - "github.com/klauspost/compress/flate" - "github.com/klauspost/crc32" -) - -// These constants are copied from the flate package, so that code that imports -// "compress/gzip" does not also have to import "compress/flate". -const ( - NoCompression = flate.NoCompression - BestSpeed = flate.BestSpeed - BestCompression = flate.BestCompression - DefaultCompression = flate.DefaultCompression - ConstantCompression = flate.ConstantCompression - HuffmanOnly = flate.HuffmanOnly -) - -// A Writer is an io.WriteCloser. -// Writes to a Writer are compressed and written to w. -type Writer struct { - Header // written at first call to Write, Flush, or Close - w io.Writer - level int - wroteHeader bool - compressor *flate.Writer - digest uint32 // CRC-32, IEEE polynomial (section 8) - size uint32 // Uncompressed size (section 2.3.1) - closed bool - buf [10]byte - err error -} - -// NewWriter returns a new Writer. -// Writes to the returned writer are compressed and written to w. -// -// It is the caller's responsibility to call Close on the WriteCloser when done. -// Writes may be buffered and not flushed until Close. -// -// Callers that wish to set the fields in Writer.Header must do so before -// the first call to Write, Flush, or Close. -func NewWriter(w io.Writer) *Writer { - z, _ := NewWriterLevel(w, DefaultCompression) - return z -} - -// NewWriterLevel is like NewWriter but specifies the compression level instead -// of assuming DefaultCompression. -// -// The compression level can be DefaultCompression, NoCompression, or any -// integer value between BestSpeed and BestCompression inclusive. The error -// returned will be nil if the level is valid. 
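Usage mirrors the standard library's compress/gzip. A minimal round-trip sketch, using NewWriterLevel (whose definition follows) together with the Reader from gunzip.go; the payload and Name are made up for illustration:

```Go
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"

	"github.com/klauspost/compress/gzip"
)

func main() {
	var buf bytes.Buffer

	zw, err := gzip.NewWriterLevel(&buf, gzip.BestSpeed)
	if err != nil {
		panic(err) // only fails for an invalid level
	}
	zw.Name = "greeting.txt" // header fields must be set before the first Write
	if _, err := zw.Write([]byte("hello, gzip")); err != nil {
		panic(err)
	}
	if err := zw.Close(); err != nil { // flushes and writes the gzip trailer
		panic(err)
	}

	zr, err := gzip.NewReader(&buf)
	if err != nil {
		panic(err)
	}
	defer zr.Close()

	out, err := ioutil.ReadAll(zr) // the checksum is verified at io.EOF
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s (Name=%q)\n", out, zr.Name)
}
```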
-func NewWriterLevel(w io.Writer, level int) (*Writer, error) { - if level < HuffmanOnly || level > BestCompression { - return nil, fmt.Errorf("gzip: invalid compression level: %d", level) - } - z := new(Writer) - z.init(w, level) - return z, nil -} - -func (z *Writer) init(w io.Writer, level int) { - compressor := z.compressor - if compressor != nil { - compressor.Reset(w) - } - *z = Writer{ - Header: Header{ - OS: 255, // unknown - }, - w: w, - level: level, - compressor: compressor, - } -} - -// Reset discards the Writer z's state and makes it equivalent to the -// result of its original state from NewWriter or NewWriterLevel, but -// writing to w instead. This permits reusing a Writer rather than -// allocating a new one. -func (z *Writer) Reset(w io.Writer) { - z.init(w, z.level) -} - -// writeBytes writes a length-prefixed byte slice to z.w. -func (z *Writer) writeBytes(b []byte) error { - if len(b) > 0xffff { - return errors.New("gzip.Write: Extra data is too large") - } - le.PutUint16(z.buf[:2], uint16(len(b))) - _, err := z.w.Write(z.buf[:2]) - if err != nil { - return err - } - _, err = z.w.Write(b) - return err -} - -// writeString writes a UTF-8 string s in GZIP's format to z.w. -// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1). -func (z *Writer) writeString(s string) (err error) { - // GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII. - needconv := false - for _, v := range s { - if v == 0 || v > 0xff { - return errors.New("gzip.Write: non-Latin-1 header string") - } - if v > 0x7f { - needconv = true - } - } - if needconv { - b := make([]byte, 0, len(s)) - for _, v := range s { - b = append(b, byte(v)) - } - _, err = z.w.Write(b) - } else { - _, err = io.WriteString(z.w, s) - } - if err != nil { - return err - } - // GZIP strings are NUL-terminated. - z.buf[0] = 0 - _, err = z.w.Write(z.buf[:1]) - return err -} - -// Write writes a compressed form of p to the underlying io.Writer. The -// compressed bytes are not necessarily flushed until the Writer is closed. -func (z *Writer) Write(p []byte) (int, error) { - if z.err != nil { - return 0, z.err - } - var n int - // Write the GZIP header lazily. - if !z.wroteHeader { - z.wroteHeader = true - z.buf[0] = gzipID1 - z.buf[1] = gzipID2 - z.buf[2] = gzipDeflate - z.buf[3] = 0 - if z.Extra != nil { - z.buf[3] |= 0x04 - } - if z.Name != "" { - z.buf[3] |= 0x08 - } - if z.Comment != "" { - z.buf[3] |= 0x10 - } - le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix())) - if z.level == BestCompression { - z.buf[8] = 2 - } else if z.level == BestSpeed { - z.buf[8] = 4 - } else { - z.buf[8] = 0 - } - z.buf[9] = z.OS - n, z.err = z.w.Write(z.buf[:10]) - if z.err != nil { - return n, z.err - } - if z.Extra != nil { - z.err = z.writeBytes(z.Extra) - if z.err != nil { - return n, z.err - } - } - if z.Name != "" { - z.err = z.writeString(z.Name) - if z.err != nil { - return n, z.err - } - } - if z.Comment != "" { - z.err = z.writeString(z.Comment) - if z.err != nil { - return n, z.err - } - } - if z.compressor == nil { - z.compressor, _ = flate.NewWriter(z.w, z.level) - } - } - z.size += uint32(len(p)) - z.digest = crc32.Update(z.digest, crc32.IEEETable, p) - n, z.err = z.compressor.Write(p) - return n, z.err -} - -// Flush flushes any pending compressed data to the underlying writer. -// -// It is useful mainly in compressed network protocols, to ensure that -// a remote reader has enough data to reconstruct a packet. Flush does -// not return until the data has been written. 
If the underlying -// writer returns an error, Flush returns that error. -// -// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH. -func (z *Writer) Flush() error { - if z.err != nil { - return z.err - } - if z.closed { - return nil - } - if !z.wroteHeader { - z.Write(nil) - if z.err != nil { - return z.err - } - } - z.err = z.compressor.Flush() - return z.err -} - -// Close closes the Writer, flushing any unwritten data to the underlying -// io.Writer, but does not close the underlying io.Writer. -func (z *Writer) Close() error { - if z.err != nil { - return z.err - } - if z.closed { - return nil - } - z.closed = true - if !z.wroteHeader { - z.Write(nil) - if z.err != nil { - return z.err - } - } - z.err = z.compressor.Close() - if z.err != nil { - return z.err - } - le.PutUint32(z.buf[:4], z.digest) - le.PutUint32(z.buf[4:8], z.size) - _, z.err = z.w.Write(z.buf[:8]) - return z.err -} diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE deleted file mode 100644 index 5cec7ee9..00000000 --- a/vendor/github.com/klauspost/cpuid/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Klaus Post - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/vendor/github.com/klauspost/cpuid/README.md b/vendor/github.com/klauspost/cpuid/README.md deleted file mode 100644 index b2b6bee8..00000000 --- a/vendor/github.com/klauspost/cpuid/README.md +++ /dev/null @@ -1,145 +0,0 @@ -# cpuid -Package cpuid provides information about the CPU running the current program. - -CPU features are detected on startup, and kept for fast access through the life of the application. -Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. - -You can access the CPU information by accessing the shared CPU variable of the cpuid library. 
- -Package home: https://github.com/klauspost/cpuid - -[![GoDoc][1]][2] [![Build Status][3]][4] - -[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg -[2]: https://godoc.org/github.com/klauspost/cpuid -[3]: https://travis-ci.org/klauspost/cpuid.svg -[4]: https://travis-ci.org/klauspost/cpuid - -# features -## CPU Instructions -* **CMOV** (i686 CMOV) -* **NX** (NX (No-Execute) bit) -* **AMD3DNOW** (AMD 3DNOW) -* **AMD3DNOWEXT** (AMD 3DNowExt) -* **MMX** (standard MMX) -* **MMXEXT** (SSE integer functions or AMD MMX ext) -* **SSE** (SSE functions) -* **SSE2** (P4 SSE functions) -* **SSE3** (Prescott SSE3 functions) -* **SSSE3** (Conroe SSSE3 functions) -* **SSE4** (Penryn SSE4.1 functions) -* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions) -* **SSE42** (Nehalem SSE4.2 functions) -* **AVX** (AVX functions) -* **AVX2** (AVX2 functions) -* **FMA3** (Intel FMA 3) -* **FMA4** (Bulldozer FMA4 functions) -* **XOP** (Bulldozer XOP functions) -* **F16C** (Half-precision floating-point conversion) -* **BMI1** (Bit Manipulation Instruction Set 1) -* **BMI2** (Bit Manipulation Instruction Set 2) -* **TBM** (AMD Trailing Bit Manipulation) -* **LZCNT** (LZCNT instruction) -* **POPCNT** (POPCNT instruction) -* **AESNI** (Advanced Encryption Standard New Instructions) -* **CLMUL** (Carry-less Multiplication) -* **HTT** (Hyperthreading (enabled)) -* **HLE** (Hardware Lock Elision) -* **RTM** (Restricted Transactional Memory) -* **RDRAND** (RDRAND instruction is available) -* **RDSEED** (RDSEED instruction is available) -* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions)) -* **SHA** (Intel SHA Extensions) -* **AVX512F** (AVX-512 Foundation) -* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions) -* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions) -* **AVX512PF** (AVX-512 Prefetch Instructions) -* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions) -* **AVX512CD** (AVX-512 Conflict Detection Instructions) -* **AVX512BW** (AVX-512 Byte and Word Instructions) -* **AVX512VL** (AVX-512 Vector Length Extensions) -* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions) -* **MPX** (Intel MPX (Memory Protection Extensions)) -* **ERMS** (Enhanced REP MOVSB/STOSB) -* **RDTSCP** (RDTSCP Instruction) -* **CX16** (CMPXCHG16B Instruction) -* **SGX** (Software Guard Extensions, with activation details) - -## Performance -* **RDTSCP()** Returns current cycle count. Can be used for benchmarking. -* **SSE2SLOW** (SSE2 is supported, but usually not faster) -* **SSE3SLOW** (SSE3 is supported, but usually not faster) -* **ATOM** (Atom processor, some SSSE3 instructions are slower) -* **Cache line** (Probable size of a cache line). -* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs. 
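A common use of these flags and indicators is to choose the fastest safe code path once, at startup. A minimal sketch using feature methods this package exports (the selected paths are placeholders):

```Go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

func main() {
	// Detection already ran in the package's init, so these are cheap reads.
	switch {
	case cpuid.CPU.AVX2():
		fmt.Println("using the AVX2 code path")
	case cpuid.CPU.SSE2():
		fmt.Println("using the SSE2 code path")
	default:
		fmt.Println("using the generic code path")
	}
}
```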
- -## CPU Vendor/VM -* **Intel** -* **AMD** -* **VIA** -* **Transmeta** -* **NSC** -* **KVM** (Kernel-based Virtual Machine) -* **MSVM** (Microsoft Hyper-V or Windows Virtual PC) -* **VMware** -* **XenHVM** - -# installing - -```go get github.com/klauspost/cpuid``` - -# example - -```Go -package main - -import ( - "fmt" - "github.com/klauspost/cpuid" -) - -func main() { - // Print basic CPU information: - fmt.Println("Name:", cpuid.CPU.BrandName) - fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores) - fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore) - fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores) - fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model) - fmt.Println("Features:", cpuid.CPU.Features) - fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine) - fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes") - fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1I, "bytes") - fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes") - fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes") - - // Test if we have a specific feature: - if cpuid.CPU.SSE() { - fmt.Println("We have Streaming SIMD Extensions") - } -} -``` - -Sample output: -``` ->go run main.go -Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz -PhysicalCores: 2 -ThreadsPerCore: 2 -LogicalCores: 4 -Family 6 Model: 42 -Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL -Cacheline bytes: 64 -We have Streaming SIMD Extensions -``` - -# private package - -In the "private" folder you can find an autogenerated version of the library you can include in your own packages. - -For this purpose all exports are removed, and functions and constants are lowercased. - -This is not the recommended way of using the library, but it is provided for convenience if using external packages is difficult for you. - -# license - -This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go deleted file mode 100644 index 9230ca56..00000000 --- a/vendor/github.com/klauspost/cpuid/cpuid.go +++ /dev/null @@ -1,1022 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// Package cpuid provides information about the CPU running the current program. -// -// CPU features are detected on startup, and kept for fast access through the life of the application. -// Currently x86 / x64 (AMD64) is supported. -// -// You can access the CPU information by accessing the shared CPU variable of the cpuid library. -// -// Package home: https://github.com/klauspost/cpuid -package cpuid - -import "strings" - -// Vendor is a representation of a CPU vendor.
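Vendor detection works the same way; a small sketch comparing VendorID against the Vendor constants defined next (the groupings are illustrative only):

```Go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

func main() {
	// VendorID compares directly against the exported Vendor constants.
	switch cpuid.CPU.VendorID {
	case cpuid.Intel, cpuid.AMD:
		fmt.Println("hardware x86 vendor:", cpuid.CPU.BrandName)
	case cpuid.KVM, cpuid.MSVM, cpuid.VMware, cpuid.XenHVM:
		fmt.Println("running under a hypervisor")
	default:
		fmt.Println("other or unknown vendor")
	}
}
```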
-type Vendor int - -const ( - Other Vendor = iota - Intel - AMD - VIA - Transmeta - NSC - KVM // Kernel-based Virtual Machine - MSVM // Microsoft Hyper-V or Windows Virtual PC - VMware - XenHVM -) - -const ( - CMOV = 1 << iota // i686 CMOV - NX // NX (No-Execute) bit - AMD3DNOW // AMD 3DNOW - AMD3DNOWEXT // AMD 3DNowExt - MMX // standard MMX - MMXEXT // SSE integer functions or AMD MMX ext - SSE // SSE functions - SSE2 // P4 SSE functions - SSE3 // Prescott SSE3 functions - SSSE3 // Conroe SSSE3 functions - SSE4 // Penryn SSE4.1 functions - SSE4A // AMD Barcelona microarchitecture SSE4a instructions - SSE42 // Nehalem SSE4.2 functions - AVX // AVX functions - AVX2 // AVX2 functions - FMA3 // Intel FMA 3 - FMA4 // Bulldozer FMA4 functions - XOP // Bulldozer XOP functions - F16C // Half-precision floating-point conversion - BMI1 // Bit Manipulation Instruction Set 1 - BMI2 // Bit Manipulation Instruction Set 2 - TBM // AMD Trailing Bit Manipulation - LZCNT // LZCNT instruction - POPCNT // POPCNT instruction - AESNI // Advanced Encryption Standard New Instructions - CLMUL // Carry-less Multiplication - HTT // Hyperthreading (enabled) - HLE // Hardware Lock Elision - RTM // Restricted Transactional Memory - RDRAND // RDRAND instruction is available - RDSEED // RDSEED instruction is available - ADX // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA // Intel SHA Extensions - AVX512F // AVX-512 Foundation - AVX512DQ // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF // AVX-512 Prefetch Instructions - AVX512ER // AVX-512 Exponential and Reciprocal Instructions - AVX512CD // AVX-512 Conflict Detection Instructions - AVX512BW // AVX-512 Byte and Word Instructions - AVX512VL // AVX-512 Vector Length Extensions - AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions - MPX // Intel MPX (Memory Protection Extensions) - ERMS // Enhanced REP MOVSB/STOSB - RDTSCP // RDTSCP Instruction - CX16 // CMPXCHG16B Instruction - SGX // Software Guard Extensions - - // Performance indicators - SSE2SLOW // SSE2 is supported, but usually not faster - SSE3SLOW // SSE3 is supported, but usually not faster - ATOM // Atom processor, some SSSE3 instructions are slower -) - -var flagNames = map[Flags]string{ - CMOV: "CMOV", // i686 CMOV - NX: "NX", // NX (No-Execute) bit - AMD3DNOW: "AMD3DNOW", // AMD 3DNOW - AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt - MMX: "MMX", // Standard MMX - MMXEXT: "MMXEXT", // SSE integer functions or AMD MMX ext - SSE: "SSE", // SSE functions - SSE2: "SSE2", // P4 SSE2 functions - SSE3: "SSE3", // Prescott SSE3 functions - SSSE3: "SSSE3", // Conroe SSSE3 functions - SSE4: "SSE4.1", // Penryn SSE4.1 functions - SSE4A: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions - SSE42: "SSE4.2", // Nehalem SSE4.2 functions - AVX: "AVX", // AVX functions - AVX2: "AVX2", // AVX2 functions - FMA3: "FMA3", // Intel FMA 3 - FMA4: "FMA4", // Bulldozer FMA4 functions - XOP: "XOP", // Bulldozer XOP functions - F16C: "F16C", // Half-precision floating-point conversion - BMI1: "BMI1", // Bit Manipulation Instruction Set 1 - BMI2: "BMI2", // Bit Manipulation Instruction Set 2 - TBM: "TBM", // AMD Trailing Bit Manipulation - LZCNT: "LZCNT", // LZCNT instruction - POPCNT: "POPCNT", // POPCNT instruction - AESNI: "AESNI", // Advanced Encryption Standard New Instructions - CLMUL: "CLMUL", // Carry-less Multiplication - HTT: "HTT", // Hyperthreading (enabled) - HLE: "HLE", // Hardware Lock Elision - RTM: "RTM", // Restricted Transactional Memory - RDRAND: "RDRAND", // RDRAND instruction is available - RDSEED: "RDSEED", // RDSEED instruction is available - ADX: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA: "SHA", // Intel SHA Extensions - AVX512F: "AVX512F", // AVX-512 Foundation - AVX512DQ: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF: "AVX512PF", // AVX-512 Prefetch Instructions - AVX512ER: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions - AVX512CD: "AVX512CD", // AVX-512 Conflict Detection Instructions - AVX512BW: "AVX512BW", // AVX-512 Byte and Word Instructions - AVX512VL: "AVX512VL", // AVX-512 Vector Length Extensions - AVX512VBMI: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions - MPX: "MPX", // Intel MPX (Memory Protection Extensions) - ERMS: "ERMS", // Enhanced REP MOVSB/STOSB - RDTSCP: "RDTSCP", // RDTSCP Instruction - CX16: "CX16", // CMPXCHG16B Instruction - SGX: "SGX", // Software Guard Extensions - - // Performance indicators - SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster - SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster - ATOM: "ATOM", // Atom processor, some SSSE3 instructions are slower - -} - -// CPUInfo contains information about the detected system CPU. -type CPUInfo struct { - BrandName string // Brand name reported by the CPU - VendorID Vendor // Comparable CPU vendor ID - Features Flags // Features of the CPU - PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. - ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable. - LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. - Family int // CPU family number - Model int // CPU model number - CacheLine int // Cache line size in bytes. Will be 0 if undetectable. - Cache struct { - L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected - L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected - L2 int // L2 Cache (per core or shared). Will be -1 if undetected - L3 int // L3 Cache (per core or shared). Will be -1 if undetected - } - SGX SGXSupport - maxFunc uint32 - maxExFunc uint32 -} - -var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) -var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) -var xgetbv func(index uint32) (eax, edx uint32) -var rdtscpAsm func() (eax, ebx, ecx, edx uint32) - -// CPU contains information about the CPU as detected on startup, -// or when Detect last was called. -// -// Use this as the primary entry point to your data; this way -// queries stay fast, since detection runs only once at startup. -var CPU CPUInfo - -func init() { - initCPU() - Detect() -} - -// Detect will re-detect current CPU info. -// This will replace the content of the exported CPU variable. -// -// Unless you expect the CPU to change while you are running your program, -// you should not need to call this function. -// If you call this, you must ensure that no other goroutine is accessing the -// exported CPU variable.
-func Detect() { - CPU.maxFunc = maxFunctionID() - CPU.maxExFunc = maxExtendedFunction() - CPU.BrandName = brandName() - CPU.CacheLine = cacheLine() - CPU.Family, CPU.Model = familyModel() - CPU.Features = support() - CPU.SGX = sgx(CPU.Features&SGX != 0) - CPU.ThreadsPerCore = threadsPerCore() - CPU.LogicalCores = logicalCores() - CPU.PhysicalCores = physicalCores() - CPU.VendorID = vendorID() - CPU.cacheSize() -} - -// Generated here: http://play.golang.org/p/BxFH2Gdc0G - -// Cmov indicates support of CMOV instructions -func (c CPUInfo) Cmov() bool { - return c.Features&CMOV != 0 -} - -// Amd3dnow indicates support of AMD 3DNOW! instructions -func (c CPUInfo) Amd3dnow() bool { - return c.Features&AMD3DNOW != 0 -} - -// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions -func (c CPUInfo) Amd3dnowExt() bool { - return c.Features&AMD3DNOWEXT != 0 -} - -// MMX indicates support of MMX instructions -func (c CPUInfo) MMX() bool { - return c.Features&MMX != 0 -} - -// MMXExt indicates support of MMXEXT instructions -// (SSE integer functions or AMD MMX ext) -func (c CPUInfo) MMXExt() bool { - return c.Features&MMXEXT != 0 -} - -// SSE indicates support of SSE instructions -func (c CPUInfo) SSE() bool { - return c.Features&SSE != 0 -} - -// SSE2 indicates support of SSE 2 instructions -func (c CPUInfo) SSE2() bool { - return c.Features&SSE2 != 0 -} - -// SSE3 indicates support of SSE 3 instructions -func (c CPUInfo) SSE3() bool { - return c.Features&SSE3 != 0 -} - -// SSSE3 indicates support of SSSE 3 instructions -func (c CPUInfo) SSSE3() bool { - return c.Features&SSSE3 != 0 -} - -// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions -func (c CPUInfo) SSE4() bool { - return c.Features&SSE4 != 0 -} - -// SSE42 indicates support of SSE4.2 instructions -func (c CPUInfo) SSE42() bool { - return c.Features&SSE42 != 0 -} - -// AVX indicates support of AVX instructions -// and operating system support of AVX instructions -func (c CPUInfo) AVX() bool { - return c.Features&AVX != 0 -} - -// AVX2 indicates support of AVX2 instructions -func (c CPUInfo) AVX2() bool { - return c.Features&AVX2 != 0 -} - -// FMA3 indicates support of FMA3 instructions -func (c CPUInfo) FMA3() bool { - return c.Features&FMA3 != 0 -} - -// FMA4 indicates support of FMA4 instructions -func (c CPUInfo) FMA4() bool { - return c.Features&FMA4 != 0 -} - -// XOP indicates support of XOP instructions -func (c CPUInfo) XOP() bool { - return c.Features&XOP != 0 -} - -// F16C indicates support of F16C instructions -func (c CPUInfo) F16C() bool { - return c.Features&F16C != 0 -} - -// BMI1 indicates support of BMI1 instructions -func (c CPUInfo) BMI1() bool { - return c.Features&BMI1 != 0 -} - -// BMI2 indicates support of BMI2 instructions -func (c CPUInfo) BMI2() bool { - return c.Features&BMI2 != 0 -} - -// TBM indicates support of TBM instructions -// (AMD Trailing Bit Manipulation) -func (c CPUInfo) TBM() bool { - return c.Features&TBM != 0 -} - -// Lzcnt indicates support of LZCNT instruction -func (c CPUInfo) Lzcnt() bool { - return c.Features&LZCNT != 0 -} - -// Popcnt indicates support of POPCNT instruction -func (c CPUInfo) Popcnt() bool { - return c.Features&POPCNT != 0 -} - -// HTT indicates the processor has Hyperthreading enabled -func (c CPUInfo) HTT() bool { - return c.Features&HTT != 0 -} - -// SSE2Slow indicates that SSE2 may be slow on this processor -func (c CPUInfo) SSE2Slow() bool { - return c.Features&SSE2SLOW != 0 -} - -// SSE3Slow indicates that SSE3 may be slow on this processor 
-func (c CPUInfo) SSE3Slow() bool { - return c.Features&SSE3SLOW != 0 -} - -// AesNi indicates support of AES-NI instructions -// (Advanced Encryption Standard New Instructions) -func (c CPUInfo) AesNi() bool { - return c.Features&AESNI != 0 -} - -// Clmul indicates support of CLMUL instructions -// (Carry-less Multiplication) -func (c CPUInfo) Clmul() bool { - return c.Features&CLMUL != 0 -} - -// NX indicates support of NX (No-Execute) bit -func (c CPUInfo) NX() bool { - return c.Features&NX != 0 -} - -// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions -func (c CPUInfo) SSE4A() bool { - return c.Features&SSE4A != 0 -} - -// HLE indicates support of Hardware Lock Elision -func (c CPUInfo) HLE() bool { - return c.Features&HLE != 0 -} - -// RTM indicates support of Restricted Transactional Memory -func (c CPUInfo) RTM() bool { - return c.Features&RTM != 0 -} - -// Rdrand indicates the RDRAND instruction is available -func (c CPUInfo) Rdrand() bool { - return c.Features&RDRAND != 0 -} - -// Rdseed indicates the RDSEED instruction is available -func (c CPUInfo) Rdseed() bool { - return c.Features&RDSEED != 0 -} - -// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions) -func (c CPUInfo) ADX() bool { - return c.Features&ADX != 0 -} - -// SHA indicates support of Intel SHA Extensions -func (c CPUInfo) SHA() bool { - return c.Features&SHA != 0 -} - -// AVX512F indicates support of AVX-512 Foundation -func (c CPUInfo) AVX512F() bool { - return c.Features&AVX512F != 0 -} - -// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions -func (c CPUInfo) AVX512DQ() bool { - return c.Features&AVX512DQ != 0 -} - -// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions -func (c CPUInfo) AVX512IFMA() bool { - return c.Features&AVX512IFMA != 0 -} - -// AVX512PF indicates support of AVX-512 Prefetch Instructions -func (c CPUInfo) AVX512PF() bool { - return c.Features&AVX512PF != 0 -} - -// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions -func (c CPUInfo) AVX512ER() bool { - return c.Features&AVX512ER != 0 -} - -// AVX512CD indicates support of AVX-512 Conflict Detection Instructions -func (c CPUInfo) AVX512CD() bool { - return c.Features&AVX512CD != 0 -} - -// AVX512BW indicates support of AVX-512 Byte and Word Instructions -func (c CPUInfo) AVX512BW() bool { - return c.Features&AVX512BW != 0 -} - -// AVX512VL indicates support of AVX-512 Vector Length Extensions -func (c CPUInfo) AVX512VL() bool { - return c.Features&AVX512VL != 0 -} - -// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions -func (c CPUInfo) AVX512VBMI() bool { - return c.Features&AVX512VBMI != 0 -} - -// MPX indicates support of Intel MPX (Memory Protection Extensions) -func (c CPUInfo) MPX() bool { - return c.Features&MPX != 0 -} - -// ERMS indicates support of Enhanced REP MOVSB/STOSB -func (c CPUInfo) ERMS() bool { - return c.Features&ERMS != 0 -} - -// RDTSCP indicates support of the RDTSCP instruction -func (c CPUInfo) RDTSCP() bool { - return c.Features&RDTSCP != 0 -} - -// CX16 indicates support of the CMPXCHG16B instruction -func (c CPUInfo) CX16() bool { - return c.Features&CX16 != 0 -} - -// Atom indicates an Atom processor -func (c CPUInfo) Atom() bool { - return c.Features&ATOM != 0 -}
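Taken together, these accessors support picking an optimized code path once at startup. A minimal sketch of that pattern (sumAVX2 and sumGeneric are hypothetical placeholders, not part of this package):

```go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

// sumGeneric is a hypothetical portable fallback.
func sumGeneric(p []byte) int {
	s := 0
	for _, b := range p {
		s += int(b)
	}
	return s
}

// sumAVX2 stands in for a vectorized variant; here it just delegates.
func sumAVX2(p []byte) int { return sumGeneric(p) }

func main() {
	// Select the implementation once, based on the detected feature set.
	sum := sumGeneric
	if cpuid.CPU.AVX2() {
		sum = sumAVX2
	}
	fmt.Println(sum([]byte("hello")))
}
```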
-// Intel returns true if vendor is recognized as Intel -func (c CPUInfo) Intel() bool { - return c.VendorID == Intel -} - -// AMD returns true if vendor is recognized as AMD -func (c CPUInfo) AMD() bool { - return c.VendorID == AMD -} - -// Transmeta returns true if vendor is recognized as Transmeta -func (c CPUInfo) Transmeta() bool { - return c.VendorID == Transmeta -} - -// NSC returns true if vendor is recognized as National Semiconductor -func (c CPUInfo) NSC() bool { - return c.VendorID == NSC -} - -// VIA returns true if vendor is recognized as VIA -func (c CPUInfo) VIA() bool { - return c.VendorID == VIA -} - -// RTCounter returns the 64-bit time-stamp counter. -// Uses the RDTSCP instruction. The value 0 is returned -// if the CPU does not support the instruction. -func (c CPUInfo) RTCounter() uint64 { - if !c.RDTSCP() { - return 0 - } - a, _, _, d := rdtscpAsm() - return uint64(a) | (uint64(d) << 32) -} - -// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. -// This variable is OS dependent, but on Linux contains information -// about the current cpu/core the code is running on. -// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. -func (c CPUInfo) Ia32TscAux() uint32 { - if !c.RDTSCP() { - return 0 - } - _, _, ecx, _ := rdtscpAsm() - return ecx -} - -// LogicalCPU will return the Logical CPU the code is currently executing on. -// This is likely to change when the OS re-schedules the running thread -// to another CPU. -// If the current core cannot be detected, -1 will be returned. -func (c CPUInfo) LogicalCPU() int { - if c.maxFunc < 1 { - return -1 - } - _, ebx, _, _ := cpuid(1) - return int(ebx >> 24) -} - -// VM will return true if the CPU ID indicates we are in -// a virtual machine. This is only a hint, and will very likely -// have many false negatives. -func (c CPUInfo) VM() bool { - switch c.VendorID { - case MSVM, KVM, VMware, XenHVM: - return true - } - return false -} - -// Flags contains detected CPU features and characteristics -type Flags uint64 - -// String returns a string representation of the detected -// CPU features. -func (f Flags) String() string { - return strings.Join(f.Strings(), ",") -} - -// Strings returns an array of the detected features. -func (f Flags) Strings() []string { - s := support() - r := make([]string, 0, 20) - for i := uint(0); i < 64; i++ { - key := Flags(1 << i) - val := flagNames[key] - if s&key != 0 { - r = append(r, val) - } - } - return r -}
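For diagnostics, the exported CPU variable and the Flags accessors combine into a small report; a sketch using only fields shown above:

```go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

func main() {
	c := cpuid.CPU
	fmt.Println("Brand:", c.BrandName)
	fmt.Printf("Cores: %d physical, %d logical, %d threads/core\n",
		c.PhysicalCores, c.LogicalCores, c.ThreadsPerCore)
	// Features.String() joins the detected flag names with commas.
	fmt.Println("Features:", c.Features.String())
}
```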
-func maxExtendedFunction() uint32 { - eax, _, _, _ := cpuid(0x80000000) - return eax -} - -func maxFunctionID() uint32 { - a, _, _, _ := cpuid(0) - return a -} - -func brandName() string { - if maxExtendedFunction() >= 0x80000004 { - v := make([]uint32, 0, 48) - for i := uint32(0); i < 3; i++ { - a, b, c, d := cpuid(0x80000002 + i) - v = append(v, a, b, c, d) - } - return strings.Trim(string(valAsString(v...)), " ") - } - return "unknown" -} - -func threadsPerCore() int { - mfi := maxFunctionID() - if mfi < 0x4 || vendorID() != Intel { - return 1 - } - - if mfi < 0xb { - _, b, _, d := cpuid(1) - if (d & (1 << 28)) != 0 { - // v will contain logical core count - v := (b >> 16) & 255 - if v > 1 { - a4, _, _, _ := cpuid(4) - // physical cores - v2 := (a4 >> 26) + 1 - if v2 > 0 { - return int(v) / int(v2) - } - } - } - return 1 - } - _, b, _, _ := cpuidex(0xb, 0) - if b&0xffff == 0 { - return 1 - } - return int(b & 0xffff) -} - -func logicalCores() int { - mfi := maxFunctionID() - switch vendorID() { - case Intel: - // Use this on old Intel processors - if mfi < 0xb { - if mfi < 1 { - return 0 - } - // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) - // that can be assigned to logical processors in a physical package. - // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. - _, ebx, _, _ := cpuid(1) - logical := (ebx >> 16) & 0xff - return int(logical) - } - _, b, _, _ := cpuidex(0xb, 1) - return int(b & 0xffff) - case AMD: - _, b, _, _ := cpuid(1) - return int((b >> 16) & 0xff) - default: - return 0 - } -} - -func familyModel() (int, int) { - if maxFunctionID() < 0x1 { - return 0, 0 - } - eax, _, _, _ := cpuid(1) - family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) - model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) - return int(family), int(model) -}
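The shifted-and-masked fields in familyModel implement the CPUID convention of adding the extended family/model bits to the base values. A worked example with an assumed EAX value (0x000306C3, a Haswell part):

```go
package main

import "fmt"

func main() {
	// EAX from CPUID leaf 1; value assumed for illustration.
	eax := uint32(0x000306C3)
	family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) // base + extended family
	model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)  // base + extended model
	fmt.Printf("family %d, model 0x%X\n", family, model) // family 6, model 0x3C
}
```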
-func physicalCores() int { - switch vendorID() { - case Intel: - return logicalCores() / threadsPerCore() - case AMD: - if maxExtendedFunction() >= 0x80000008 { - _, _, c, _ := cpuid(0x80000008) - return int(c&0xff) + 1 - } - } - return 0 -} - -// Excerpted from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID -var vendorMapping = map[string]Vendor{ - "AMDisbetter!": AMD, - "AuthenticAMD": AMD, - "CentaurHauls": VIA, - "GenuineIntel": Intel, - "TransmetaCPU": Transmeta, - "GenuineTMx86": Transmeta, - "Geode by NSC": NSC, - "VIA VIA VIA ": VIA, - "KVMKVMKVMKVM": KVM, - "Microsoft Hv": MSVM, - "VMwareVMware": VMware, - "XenVMMXenVMM": XenHVM, -} - -func vendorID() Vendor { - _, b, c, d := cpuid(0) - v := valAsString(b, d, c) - vend, ok := vendorMapping[string(v)] - if !ok { - return Other - } - return vend -} - -func cacheLine() int { - if maxFunctionID() < 0x1 { - return 0 - } - - _, ebx, _, _ := cpuid(1) - cache := (ebx & 0xff00) >> 5 // CLFLUSH size (bits 15:8, in 8-byte words, so the shift multiplies by 8) - if cache == 0 && maxExtendedFunction() >= 0x80000006 { - _, _, ecx, _ := cpuid(0x80000006) - cache = ecx & 0xff // cacheline size - } - // TODO: Read from Cache and TLB Information - return int(cache) -} - -func (c *CPUInfo) cacheSize() { - c.Cache.L1D = -1 - c.Cache.L1I = -1 - c.Cache.L2 = -1 - c.Cache.L3 = -1 - vendor := vendorID() - switch vendor { - case Intel: - if maxFunctionID() < 4 { - return - } - for i := uint32(0); ; i++ { - eax, ebx, ecx, _ := cpuidex(4, i) - cacheType := eax & 15 - if cacheType == 0 { - break - } - cacheLevel := (eax >> 5) & 7 - coherency := int(ebx&0xfff) + 1 - partitions := int((ebx>>12)&0x3ff) + 1 - associativity := int((ebx>>22)&0x3ff) + 1 - sets := int(ecx) + 1 - size := associativity * partitions * coherency * sets - switch cacheLevel { - case 1: - if cacheType == 1 { - // 1 = Data Cache - c.Cache.L1D = size - } else if cacheType == 2 { - // 2 = Instruction Cache - c.Cache.L1I = size - } else { - if c.Cache.L1D < 0 { - c.Cache.L1D = size - } - if c.Cache.L1I < 0 { - c.Cache.L1I = size - } - } - case 2: - c.Cache.L2 = size - case 3: - c.Cache.L3 = size - } - } - case AMD: - // Untested. - if maxExtendedFunction() < 0x80000005 { - return - } - _, _, ecx, edx := cpuid(0x80000005) - c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) - c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) - - if maxExtendedFunction() < 0x80000006 { - return - } - _, _, ecx, _ = cpuid(0x80000006) - c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) - } - - return -} - -type SGXSupport struct { - Available bool - SGX1Supported bool - SGX2Supported bool - MaxEnclaveSizeNot64 int64 - MaxEnclaveSize64 int64 -} - -func sgx(available bool) (rval SGXSupport) { - rval.Available = available - - if !available { - return - } - - a, _, _, d := cpuidex(0x12, 0) - rval.SGX1Supported = a&0x01 != 0 - rval.SGX2Supported = a&0x02 != 0 - rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 - rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 - - return -} - -func support() Flags { - mfi := maxFunctionID() - vend := vendorID() - if mfi < 0x1 { - return 0 - } - rval := uint64(0) - _, _, c, d := cpuid(1) - if (d & (1 << 15)) != 0 { - rval |= CMOV - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 25)) != 0 { - rval |= MMXEXT - } - if (d & (1 << 25)) != 0 { - rval |= SSE - } - if (d & (1 << 26)) != 0 { - rval |= SSE2 - } - if (c & 1) != 0 { - rval |= SSE3 - } - if (c & 0x00000200) != 0 { - rval |= SSSE3 - } - if (c & 0x00080000) != 0 { - rval |= SSE4 - } - if (c & 0x00100000) != 0 { - rval |= SSE42 - } - if (c & (1 << 25)) != 0 { - rval |= AESNI - } - if (c & (1 << 1)) != 0 { - rval |= CLMUL - } - if c&(1<<23) != 0 { - rval |= POPCNT - } - if c&(1<<30) != 0 { - rval |= RDRAND - } - if c&(1<<29) != 0 { - rval |= F16C - } - if c&(1<<13) != 0 { - rval |= CX16 - } - if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { - if threadsPerCore() > 1 { - rval |= HTT - } - } - - // Check XGETBV, OSXSAVE and AVX bits - if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { - // Check for OS support - eax, _ := xgetbv(0) - if (eax & 0x6) == 0x6 { - rval |= AVX - if (c & 0x00001000) != 0 { - rval |= FMA3 - } - } - } - - // Check AVX2. AVX2 requires OS support, but BMI1/2 don't. - if mfi >= 7 { - _, ebx, ecx, _ := cpuidex(7, 0) - if (rval&AVX) != 0 && (ebx&0x00000020) != 0 { - rval |= AVX2 - } - if (ebx & 0x00000008) != 0 { - rval |= BMI1 - if (ebx & 0x00000100) != 0 { - rval |= BMI2 - } - } - if ebx&(1<<2) != 0 { - rval |= SGX - } - if ebx&(1<<4) != 0 { - rval |= HLE - } - if ebx&(1<<9) != 0 { - rval |= ERMS - } - if ebx&(1<<11) != 0 { - rval |= RTM - } - if ebx&(1<<14) != 0 { - rval |= MPX - } - if ebx&(1<<18) != 0 { - rval |= RDSEED - } - if ebx&(1<<19) != 0 { - rval |= ADX - } - if ebx&(1<<29) != 0 { - rval |= SHA - } - - // Only detect AVX-512 features if XGETBV is supported - if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { - // Check for OS support - eax, _ := xgetbv(0) - - // Verify that XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of ZMM0-ZMM15 and - // ZMM16-ZMM31 state are enabled by OS) - // and that XCR0[2:1] = '11b' (XMM state and YMM state are enabled by OS).
- if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { - if ebx&(1<<16) != 0 { - rval |= AVX512F - } - if ebx&(1<<17) != 0 { - rval |= AVX512DQ - } - if ebx&(1<<21) != 0 { - rval |= AVX512IFMA - } - if ebx&(1<<26) != 0 { - rval |= AVX512PF - } - if ebx&(1<<27) != 0 { - rval |= AVX512ER - } - if ebx&(1<<28) != 0 { - rval |= AVX512CD - } - if ebx&(1<<30) != 0 { - rval |= AVX512BW - } - if ebx&(1<<31) != 0 { - rval |= AVX512VL - } - // ecx - if ecx&(1<<1) != 0 { - rval |= AVX512VBMI - } - } - } - } - - if maxExtendedFunction() >= 0x80000001 { - _, _, c, d := cpuid(0x80000001) - if (c & (1 << 5)) != 0 { - rval |= LZCNT - rval |= POPCNT - } - if (d & (1 << 31)) != 0 { - rval |= AMD3DNOW - } - if (d & (1 << 30)) != 0 { - rval |= AMD3DNOWEXT - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 22)) != 0 { - rval |= MMXEXT - } - if (c & (1 << 6)) != 0 { - rval |= SSE4A - } - if d&(1<<20) != 0 { - rval |= NX - } - if d&(1<<27) != 0 { - rval |= RDTSCP - } - - /* Allow for selectively disabling SSE2 functions on AMD processors - with SSE2 support but not SSE4a. This includes Athlon64, some - Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster - than SSE2 often enough to utilize this special-case flag. - AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case - so that SSE2 is used unless explicitly disabled by checking - AV_CPU_FLAG_SSE2SLOW. */ - if vendorID() != Intel && - rval&SSE2 != 0 && (c&0x00000040) == 0 { - rval |= SSE2SLOW - } - - /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be - * used unless the OS has AVX support. */ - if (rval & AVX) != 0 { - if (c & 0x00000800) != 0 { - rval |= XOP - } - if (c & 0x00010000) != 0 { - rval |= FMA4 - } - } - - if vendorID() == Intel { - family, model := familyModel() - if family == 6 && (model == 9 || model == 13 || model == 14) { - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and - * 6/14 (core1 "yonah") theoretically support sse2, but it's - * usually slower than mmx. */ - if (rval & SSE2) != 0 { - rval |= SSE2SLOW - } - if (rval & SSE3) != 0 { - rval |= SSE3SLOW - } - } - /* The Atom processor has SSSE3 support, which is useful in many cases, - * but sometimes the SSSE3 version is slower than the SSE2 equivalent - * on the Atom, but is generally faster on other processors supporting - * SSSE3. This flag allows for selectively disabling certain SSSE3 - * functions on the Atom. */ - if family == 6 && model == 28 { - rval |= ATOM - } - } - } - return Flags(rval) -} - -func valAsString(values ...uint32) []byte { - r := make([]byte, 4*len(values)) - for i, v := range values { - dst := r[i*4:] - dst[0] = byte(v & 0xff) - dst[1] = byte((v >> 8) & 0xff) - dst[2] = byte((v >> 16) & 0xff) - dst[3] = byte((v >> 24) & 0xff) - switch { - case dst[0] == 0: - return r[:i*4] - case dst[1] == 0: - return r[:i*4+1] - case dst[2] == 0: - return r[:i*4+2] - case dst[3] == 0: - return r[:i*4+3] - } - } - return r -} diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s deleted file mode 100644 index 4d731711..00000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_386.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. 
- -// +build 386,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORL CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+4(FP) - MOVL BX, ebx+8(FP) - MOVL CX, ecx+12(FP) - MOVL DX, edx+16(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func xgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+4(FP) - MOVL DX, edx+8(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s deleted file mode 100644 index 3c1d60e4..00000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -//+build amd64,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORQ CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmXgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+8(FP) - MOVL DX, edx+12(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go deleted file mode 100644 index a5f04dd6..00000000 --- a/vendor/github.com/klauspost/cpuid/detect_intel.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build 386,!gccgo amd64,!gccgo - -package cpuid - -func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -func asmXgetbv(index uint32) (eax, edx uint32) -func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) - -func initCPU() { - cpuid = asmCpuid - cpuidex = asmCpuidex - xgetbv = asmXgetbv - rdtscpAsm = asmRdtscpAsm -} diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go deleted file mode 100644 index 909c5d9a..00000000 --- a/vendor/github.com/klauspost/cpuid/detect_ref.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. 
- -// +build !amd64,!386 gccgo - -package cpuid - -func initCPU() { - cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - xgetbv = func(index uint32) (eax, edx uint32) { - return 0, 0 - } - - rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } -} diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go deleted file mode 100644 index c060b816..00000000 --- a/vendor/github.com/klauspost/cpuid/generate.go +++ /dev/null @@ -1,3 +0,0 @@ -package cpuid - -//go:generate go run private-gen.go diff --git a/vendor/github.com/klauspost/cpuid/private-gen.go b/vendor/github.com/klauspost/cpuid/private-gen.go deleted file mode 100644 index 437333d2..00000000 --- a/vendor/github.com/klauspost/cpuid/private-gen.go +++ /dev/null @@ -1,476 +0,0 @@ -// +build ignore - -package main - -import ( - "bytes" - "fmt" - "go/ast" - "go/parser" - "go/printer" - "go/token" - "io" - "io/ioutil" - "log" - "os" - "reflect" - "strings" - "unicode" - "unicode/utf8" -) - -var inFiles = []string{"cpuid.go", "cpuid_test.go"} -var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"} -var fileSet = token.NewFileSet() -var reWrites = []rewrite{ - initRewrite("CPUInfo -> cpuInfo"), - initRewrite("Vendor -> vendor"), - initRewrite("Flags -> flags"), - initRewrite("Detect -> detect"), - initRewrite("CPU -> cpu"), -} -var excludeNames = map[string]bool{"string": true, "join": true, "trim": true, - // cpuid_test.go - "t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true, -} - -var excludePrefixes = []string{"test", "benchmark"}
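Each entry in reWrites is a gofmt-style "pattern -> replacement" rule applied to the parsed AST. A standalone sketch of the underlying idea, renaming one identifier with go/parser and go/ast (identifier names chosen for illustration):

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
)

func main() {
	// Parse a small expression and rename one identifier, which is the
	// essence of what each "pattern -> replacement" rule does.
	expr, err := parser.ParseExpr("CPUInfo{}")
	if err != nil {
		panic(err)
	}
	ast.Inspect(expr, func(n ast.Node) bool {
		if id, ok := n.(*ast.Ident); ok && id.Name == "CPUInfo" {
			id.Name = "cpuInfo"
		}
		return true
	})
	// Prints the rewritten type name: cpuInfo
	fmt.Println(expr.(*ast.CompositeLit).Type.(*ast.Ident).Name)
}
```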
-func main() { - Package := "private" - parserMode := parser.ParseComments - exported := make(map[string]rewrite) - for _, file := range inFiles { - in, err := os.Open(file) - if err != nil { - log.Fatalf("opening input: %s", err) - } - - src, err := ioutil.ReadAll(in) - if err != nil { - log.Fatalf("reading input: %s", err) - } - - astfile, err := parser.ParseFile(fileSet, file, src, parserMode) - if err != nil { - log.Fatalf("parsing input: %s", err) - } - - for _, rw := range reWrites { - astfile = rw(astfile) - } - - // Inspect the AST and print all identifiers and literals. - var startDecl token.Pos - var endDecl token.Pos - ast.Inspect(astfile, func(n ast.Node) bool { - var s string - switch x := n.(type) { - case *ast.Ident: - if x.IsExported() { - t := strings.ToLower(x.Name) - for _, pre := range excludePrefixes { - if strings.HasPrefix(t, pre) { - return true - } - } - if !excludeNames[t] { - //if x.Pos() > startDecl && x.Pos() < endDecl { - exported[x.Name] = initRewrite(x.Name + " -> " + t) - } - } - - case *ast.GenDecl: - if x.Tok == token.CONST && x.Lparen > 0 { - startDecl = x.Lparen - endDecl = x.Rparen - // fmt.Printf("Decl:%s -> %s\n", fileSet.Position(startDecl), fileSet.Position(endDecl)) - } - } - if s != "" { - fmt.Printf("%s:\t%s\n", fileSet.Position(n.Pos()), s) - } - return true - }) - - for _, rw := range exported { - astfile = rw(astfile) - } - - var buf bytes.Buffer - - printer.Fprint(&buf, fileSet, astfile) - - // Remove package documentation and insert information - s := buf.String() - ind := strings.Index(buf.String(), "\npackage cpuid") - s = s[ind:] - s = "// Generated, DO NOT EDIT,\n" + - "// but copy it to your own project and rename the package.\n" + - "// See more at http://github.com/klauspost/cpuid\n" + - s - - outputName := Package + string(os.PathSeparator) + file - - err = ioutil.WriteFile(outputName, []byte(s), 0644) - if err != nil { - log.Fatalf("writing output: %s", err) - } - log.Println("Generated", outputName) - } - - for _, file := range copyFiles { - dst := "" - if strings.HasPrefix(file, "cpuid") { - dst = Package + string(os.PathSeparator) + file - } else { - dst = Package + string(os.PathSeparator) + "cpuid_" + file - } - err := copyFile(file, dst) - if err != nil { - log.Fatalf("copying file: %s", err) - } - log.Println("Copied", dst) - } -} - -// copyFile copies a file from src to dst. If src and dst files exist, and are -// the same, then return success. Copy the file contents from src to dst. -func copyFile(src, dst string) (err error) { - sfi, err := os.Stat(src) - if err != nil { - return - } - if !sfi.Mode().IsRegular() { - // cannot copy non-regular files (e.g., directories, - // symlinks, devices, etc.) - return fmt.Errorf("CopyFile: non-regular source file %s (%q)", sfi.Name(), sfi.Mode().String()) - } - dfi, err := os.Stat(dst) - if err != nil { - if !os.IsNotExist(err) { - return - } - } else { - if !(dfi.Mode().IsRegular()) { - return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dfi.Name(), dfi.Mode().String()) - } - if os.SameFile(sfi, dfi) { - return - } - } - err = copyFileContents(src, dst) - return -} - -// copyFileContents copies the contents of the file named src to the file named -// by dst. The file will be created if it does not already exist. If the -// destination file exists, all its contents will be replaced by the contents -// of the source file.
-func copyFileContents(src, dst string) (err error) { - in, err := os.Open(src) - if err != nil { - return - } - defer in.Close() - out, err := os.Create(dst) - if err != nil { - return - } - defer func() { - cerr := out.Close() - if err == nil { - err = cerr - } - }() - if _, err = io.Copy(out, in); err != nil { - return - } - err = out.Sync() - return -} - -type rewrite func(*ast.File) *ast.File - -// Mostly copied from gofmt -func initRewrite(rewriteRule string) rewrite { - f := strings.Split(rewriteRule, "->") - if len(f) != 2 { - fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n") - os.Exit(2) - } - pattern := parseExpr(f[0], "pattern") - replace := parseExpr(f[1], "replacement") - return func(p *ast.File) *ast.File { return rewriteFile(pattern, replace, p) } -} - -// parseExpr parses s as an expression. -// It might make sense to expand this to allow statement patterns, -// but there are problems with preserving formatting and also -// with what a wildcard for a statement looks like. -func parseExpr(s, what string) ast.Expr { - x, err := parser.ParseExpr(s) - if err != nil { - fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err) - os.Exit(2) - } - return x -} - -// Keep this function for debugging. -/* -func dump(msg string, val reflect.Value) { - fmt.Printf("%s:\n", msg) - ast.Print(fileSet, val.Interface()) - fmt.Println() -} -*/ - -// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file. -func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File { - cmap := ast.NewCommentMap(fileSet, p, p.Comments) - m := make(map[string]reflect.Value) - pat := reflect.ValueOf(pattern) - repl := reflect.ValueOf(replace) - - var rewriteVal func(val reflect.Value) reflect.Value - rewriteVal = func(val reflect.Value) reflect.Value { - // don't bother if val is invalid to start with - if !val.IsValid() { - return reflect.Value{} - } - for k := range m { - delete(m, k) - } - val = apply(rewriteVal, val) - if match(m, pat, val) { - val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos())) - } - return val - } - - r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File) - r.Comments = cmap.Filter(r).Comments() // recreate comments list - return r -} - -// set is a wrapper for x.Set(y); it protects the caller from panics if x cannot be changed to y. -func set(x, y reflect.Value) { - // don't bother if x cannot be set or y is invalid - if !x.CanSet() || !y.IsValid() { - return - } - defer func() { - if x := recover(); x != nil { - if s, ok := x.(string); ok && - (strings.Contains(s, "type mismatch") || strings.Contains(s, "not assignable")) { - // x cannot be set to y - ignore this rewrite - return - } - panic(x) - } - }() - x.Set(y) -} - -// Values/types for special cases. -var ( - objectPtrNil = reflect.ValueOf((*ast.Object)(nil)) - scopePtrNil = reflect.ValueOf((*ast.Scope)(nil)) - - identType = reflect.TypeOf((*ast.Ident)(nil)) - objectPtrType = reflect.TypeOf((*ast.Object)(nil)) - positionType = reflect.TypeOf(token.NoPos) - callExprType = reflect.TypeOf((*ast.CallExpr)(nil)) - scopePtrType = reflect.TypeOf((*ast.Scope)(nil)) -) - -// apply replaces each AST field x in val with f(x), returning val. -// To avoid extra conversions, f operates on the reflect.Value form. 
-func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value { - if !val.IsValid() { - return reflect.Value{} - } - - // *ast.Objects introduce cycles and are likely incorrect after - // rewrite; don't follow them but replace with nil instead - if val.Type() == objectPtrType { - return objectPtrNil - } - - // similarly for scopes: they are likely incorrect after a rewrite; - // replace them with nil - if val.Type() == scopePtrType { - return scopePtrNil - } - - switch v := reflect.Indirect(val); v.Kind() { - case reflect.Slice: - for i := 0; i < v.Len(); i++ { - e := v.Index(i) - set(e, f(e)) - } - case reflect.Struct: - for i := 0; i < v.NumField(); i++ { - e := v.Field(i) - set(e, f(e)) - } - case reflect.Interface: - e := v.Elem() - set(v, f(e)) - } - return val -} - -func isWildcard(s string) bool { - rune, size := utf8.DecodeRuneInString(s) - return size == len(s) && unicode.IsLower(rune) -} - -// match returns true if pattern matches val, -// recording wildcard submatches in m. -// If m == nil, match checks whether pattern == val. -func match(m map[string]reflect.Value, pattern, val reflect.Value) bool { - // Wildcard matches any expression. If it appears multiple - // times in the pattern, it must match the same expression - // each time. - if m != nil && pattern.IsValid() && pattern.Type() == identType { - name := pattern.Interface().(*ast.Ident).Name - if isWildcard(name) && val.IsValid() { - // wildcards only match valid (non-nil) expressions. - if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() { - if old, ok := m[name]; ok { - return match(nil, old, val) - } - m[name] = val - return true - } - } - } - - // Otherwise, pattern and val must match recursively. - if !pattern.IsValid() || !val.IsValid() { - return !pattern.IsValid() && !val.IsValid() - } - if pattern.Type() != val.Type() { - return false - } - - // Special cases. - switch pattern.Type() { - case identType: - // For identifiers, only the names need to match - // (and none of the other *ast.Object information). - // This is a common case, handle it all here instead - // of recursing down any further via reflection. - p := pattern.Interface().(*ast.Ident) - v := val.Interface().(*ast.Ident) - return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name - case objectPtrType, positionType: - // object pointers and token positions always match - return true - case callExprType: - // For calls, the Ellipsis fields (token.Position) must - // match since that is how f(x) and f(x...) are different. - // Check them here but fall through for the remaining fields. - p := pattern.Interface().(*ast.CallExpr) - v := val.Interface().(*ast.CallExpr) - if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() { - return false - } - } - - p := reflect.Indirect(pattern) - v := reflect.Indirect(val) - if !p.IsValid() || !v.IsValid() { - return !p.IsValid() && !v.IsValid() - } - - switch p.Kind() { - case reflect.Slice: - if p.Len() != v.Len() { - return false - } - for i := 0; i < p.Len(); i++ { - if !match(m, p.Index(i), v.Index(i)) { - return false - } - } - return true - - case reflect.Struct: - for i := 0; i < p.NumField(); i++ { - if !match(m, p.Field(i), v.Field(i)) { - return false - } - } - return true - - case reflect.Interface: - return match(m, p.Elem(), v.Elem()) - } - - // Handle token integers, etc. - return p.Interface() == v.Interface() -} - -// subst returns a copy of pattern with values from m substituted in place -// of wildcards and pos used as the position of tokens from the pattern. 
-// if m == nil, subst returns a copy of pattern and doesn't change the line -// number information. -func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value { - if !pattern.IsValid() { - return reflect.Value{} - } - - // Wildcard gets replaced with map value. - if m != nil && pattern.Type() == identType { - name := pattern.Interface().(*ast.Ident).Name - if isWildcard(name) { - if old, ok := m[name]; ok { - return subst(nil, old, reflect.Value{}) - } - } - } - - if pos.IsValid() && pattern.Type() == positionType { - // use new position only if old position was valid in the first place - if old := pattern.Interface().(token.Pos); !old.IsValid() { - return pattern - } - return pos - } - - // Otherwise copy. - switch p := pattern; p.Kind() { - case reflect.Slice: - v := reflect.MakeSlice(p.Type(), p.Len(), p.Len()) - for i := 0; i < p.Len(); i++ { - v.Index(i).Set(subst(m, p.Index(i), pos)) - } - return v - - case reflect.Struct: - v := reflect.New(p.Type()).Elem() - for i := 0; i < p.NumField(); i++ { - v.Field(i).Set(subst(m, p.Field(i), pos)) - } - return v - - case reflect.Ptr: - v := reflect.New(p.Type()).Elem() - if elem := p.Elem(); elem.IsValid() { - v.Set(subst(m, elem, pos).Addr()) - } - return v - - case reflect.Interface: - v := reflect.New(p.Type()).Elem() - if elem := p.Elem(); elem.IsValid() { - v.Set(subst(m, elem, pos)) - } - return v - } - - return pattern -} diff --git a/vendor/github.com/klauspost/crc32/LICENSE b/vendor/github.com/klauspost/crc32/LICENSE deleted file mode 100644 index 4fd5963e..00000000 --- a/vendor/github.com/klauspost/crc32/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2012 The Go Authors. All rights reserved. -Copyright (c) 2015 Klaus Post - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md deleted file mode 100644 index 029625d3..00000000 --- a/vendor/github.com/klauspost/crc32/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# crc32 -CRC32 hash with x64 optimizations - -This package is a drop-in replacement for the standard library `hash/crc32` package that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. - -[Build Status](https://travis-ci.org/klauspost/crc32) - -# usage - -Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. - -Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
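A minimal sketch of the drop-in usage, exercising both the IEEE default and the hardware-accelerated Castagnoli table (API as shown in crc32.go below):

```go
package main

import (
	"fmt"

	"github.com/klauspost/crc32"
)

func main() {
	data := []byte("hello world")
	// IEEE table, same default as the standard library.
	fmt.Printf("%08x\n", crc32.ChecksumIEEE(data))
	// Castagnoli takes the SSE 4.2 path on capable CPUs.
	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("%08x\n", crc32.Checksum(data, tab))
}
```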
- -# changes -* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. -* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. - - -# performance - -For *Go 1.7*, performance is equivalent to the standard library, so if you use this package with Go 1.7 you can switch back. - - -For IEEE tables (the most common), there is approximately a factor 10 speedup with the "CLMUL" (Carryless multiplication) instruction: -``` -benchmark old ns/op new ns/op delta -BenchmarkCrc32KB 99955 10258 -89.74% - -benchmark old MB/s new MB/s speedup -BenchmarkCrc32KB 327.83 3194.20 9.74x -``` - -For other tables and "CLMUL" capable machines the performance is the same as the standard library. - -Here are some detailed benchmarks, comparing to the go 1.5 standard library with and without assembler enabled. - -``` -Std: Standard Go 1.5 library -Crc: Indicates IEEE type CRC. -40B: Size of each slice encoded. -NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). -Castagnoli: Castagnoli CRC type. - -BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s -BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) -BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) - -BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s -BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) -BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) - -BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) -BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) -BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) - -BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) -BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) -BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) - -BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s -BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) -BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) -BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s -BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) -BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) -BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s -BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) -BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) -BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s -BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) -BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) -BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) -``` - -The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library. - -However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. - -# license - -Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go deleted file mode 100644 index 8aa91b17..00000000 --- a/vendor/github.com/klauspost/crc32/crc32.go +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32, -// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for -// information. -// -// Polynomials are represented in LSB-first form, also known as reversed representation. -// -// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials -// for information. -package crc32 - -import ( - "hash" - "sync" -) - -// The size of a CRC-32 checksum in bytes. -const Size = 4 - -// Predefined polynomials. -const ( - // IEEE is by far and away the most common CRC-32 polynomial. - // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ... - IEEE = 0xedb88320 - - // Castagnoli's polynomial, used in iSCSI. - // Has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/26.231911 - Castagnoli = 0x82f63b78 - - // Koopman's polynomial. - // Also has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/DSN.2002.1028931 - Koopman = 0xeb31d82e -)
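In the reversed representation used for these constants, the update step shifts right and XORs in the polynomial whenever the low bit is set. A table-free, bit-at-a-time sketch (equivalent in result to the table-driven code in this file, but not the package's actual implementation):

```go
package main

import (
	"fmt"
	hashcrc32 "hash/crc32"
)

// crcBitwise computes a CRC-32 one bit at a time using a polynomial in
// the reversed (LSB-first) representation. A teaching sketch only.
func crcBitwise(poly uint32, p []byte) uint32 {
	crc := ^uint32(0)
	for _, b := range p {
		crc ^= uint32(b)
		for i := 0; i < 8; i++ {
			if crc&1 != 0 {
				crc = crc>>1 ^ poly
			} else {
				crc >>= 1
			}
		}
	}
	return ^crc
}

func main() {
	data := []byte("hello")
	// Matches the standard library for the IEEE polynomial.
	fmt.Printf("%08x %08x\n", crcBitwise(0xedb88320, data), hashcrc32.ChecksumIEEE(data))
}
```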
- -// Table is a 256-word table representing the polynomial for efficient processing. -type Table [256]uint32 - -// This file makes use of functions implemented in architecture-specific files. -// The interface that they implement is as follows: -// -// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE -// // algorithm is available. -// archAvailableIEEE() bool -// -// // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm. -// // It can only be called if archAvailableIEEE() returns true. -// archInitIEEE() -// -// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if -// // archInitIEEE() was previously called. -// archUpdateIEEE(crc uint32, p []byte) uint32 -// -// // archAvailableCastagnoli reports whether an architecture-specific -// // CRC32-C algorithm is available. -// archAvailableCastagnoli() bool -// -// // archInitCastagnoli initializes the architecture-specific CRC32-C -// // algorithm. It can only be called if archAvailableCastagnoli() returns -// // true. -// archInitCastagnoli() -// -// // archUpdateCastagnoli updates the given CRC32-C. It can only be called -// // if archInitCastagnoli() was previously called. -// archUpdateCastagnoli(crc uint32, p []byte) uint32 - -// castagnoliTable points to a lazily initialized Table for the Castagnoli -// polynomial. MakeTable will always return this value when asked to make a -// Castagnoli table so we can compare against it to find when the caller is -// using this polynomial. -var castagnoliTable *Table -var castagnoliTable8 *slicing8Table -var castagnoliArchImpl bool -var updateCastagnoli func(crc uint32, p []byte) uint32 -var castagnoliOnce sync.Once - -func castagnoliInit() { - castagnoliTable = simpleMakeTable(Castagnoli) - castagnoliArchImpl = archAvailableCastagnoli() - - if castagnoliArchImpl { - archInitCastagnoli() - updateCastagnoli = archUpdateCastagnoli - } else { - // Initialize the slicing-by-8 table. - castagnoliTable8 = slicingMakeTable(Castagnoli) - updateCastagnoli = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, castagnoliTable8, p) - } - } -} - -// IEEETable is the table for the IEEE polynomial. -var IEEETable = simpleMakeTable(IEEE) - -// ieeeTable8 is the slicing8Table for IEEE -var ieeeTable8 *slicing8Table -var ieeeArchImpl bool -var updateIEEE func(crc uint32, p []byte) uint32 -var ieeeOnce sync.Once - -func ieeeInit() { - ieeeArchImpl = archAvailableIEEE() - - if ieeeArchImpl { - archInitIEEE() - updateIEEE = archUpdateIEEE - } else { - // Initialize the slicing-by-8 table. - ieeeTable8 = slicingMakeTable(IEEE) - updateIEEE = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, ieeeTable8, p) - } - } -} - -// MakeTable returns a Table constructed from the specified polynomial. -// The contents of this Table must not be modified. -func MakeTable(poly uint32) *Table { - switch poly { - case IEEE: - ieeeOnce.Do(ieeeInit) - return IEEETable - case Castagnoli: - castagnoliOnce.Do(castagnoliInit) - return castagnoliTable - } - return simpleMakeTable(poly) -}
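The digest type below satisfies hash.Hash32, so streaming use mirrors the standard library; a small sketch:

```go
package main

import (
	"fmt"
	"io"
	"strings"

	"github.com/klauspost/crc32"
)

func main() {
	h := crc32.NewIEEE()
	// Hash a stream incrementally; Sum32 can be read at any point.
	if _, err := io.Copy(h, strings.NewReader("hello world")); err != nil {
		panic(err)
	}
	fmt.Printf("%08x\n", h.Sum32())
}
```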
- -// digest represents the partial evaluation of a checksum. -type digest struct { - crc uint32 - tab *Table -} - -// New creates a new hash.Hash32 computing the CRC-32 checksum -// using the polynomial represented by the Table. -// Its Sum method will lay the value out in big-endian byte order. -func New(tab *Table) hash.Hash32 { - if tab == IEEETable { - ieeeOnce.Do(ieeeInit) - } - return &digest{0, tab} -} - -// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum -// using the IEEE polynomial. -// Its Sum method will lay the value out in big-endian byte order. -func NewIEEE() hash.Hash32 { return New(IEEETable) } - -func (d *digest) Size() int { return Size } - -func (d *digest) BlockSize() int { return 1 } - -func (d *digest) Reset() { d.crc = 0 } - -// Update returns the result of adding the bytes in p to the crc. -func Update(crc uint32, tab *Table, p []byte) uint32 { - switch tab { - case castagnoliTable: - return updateCastagnoli(crc, p) - case IEEETable: - // Unfortunately, because IEEETable is exported, IEEE may be used without a - // call to MakeTable. We have to make sure it gets initialized in that case. - ieeeOnce.Do(ieeeInit) - return updateIEEE(crc, p) - default: - return simpleUpdate(crc, tab, p) - } -} - -func (d *digest) Write(p []byte) (n int, err error) { - switch d.tab { - case castagnoliTable: - d.crc = updateCastagnoli(d.crc, p) - case IEEETable: - // We only create digest objects through New() which takes care of - // initialization in this case. - d.crc = updateIEEE(d.crc, p) - default: - d.crc = simpleUpdate(d.crc, d.tab, p) - } - return len(p), nil -} - -func (d *digest) Sum32() uint32 { return d.crc } - -func (d *digest) Sum(in []byte) []byte { - s := d.Sum32() - return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) -} - -// Checksum returns the CRC-32 checksum of data -// using the polynomial represented by the Table. -func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } - -// ChecksumIEEE returns the CRC-32 checksum of data -// using the IEEE polynomial. -func ChecksumIEEE(data []byte) uint32 { - ieeeOnce.Do(ieeeInit) - return updateIEEE(0, data) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go deleted file mode 100644 index af2a0b84..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a -// description of the interface that each architecture-specific file -// implements. - -package crc32 - -import "unsafe" - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// and IEEE CRC. - -// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use -// CPUID to test for SSE 4.1, 4.2 and CLMUL support. -func haveSSE41() bool -func haveSSE42() bool -func haveCLMUL() bool - -// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42Triple( - crcA, crcB, crcC uint32, - a, b, c []byte, - rounds uint32, -) (retA uint32, retB uint32, retC uint32) - -// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ -// instruction as well as SSE 4.1.
-//go:noescape -func ieeeCLMUL(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() -var useFastIEEE = haveCLMUL() && haveSSE41() - -const castagnoliK1 = 168 -const castagnoliK2 = 1344 - -type sse42Table [4]Table - -var castagnoliSSE42TableK1 *sse42Table -var castagnoliSSE42TableK2 *sse42Table - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("arch-specific Castagnoli not available") - } - castagnoliSSE42TableK1 = new(sse42Table) - castagnoliSSE42TableK2 = new(sse42Table) - // See description in updateCastagnoli. - // t[0][i] = CRC(i000, O) - // t[1][i] = CRC(0i00, O) - // t[2][i] = CRC(00i0, O) - // t[3][i] = CRC(000i, O) - // where O is a sequence of K zeros. - var tmp [castagnoliK2]byte - for b := 0; b < 4; b++ { - for i := 0; i < 256; i++ { - val := uint32(i) << uint32(b*8) - castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) - castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) - } - } -} - -// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the -// table given) with the given initial crc value. This corresponds to -// CRC(crc, O) in the description in updateCastagnoli. -func castagnoliShift(table *sse42Table, crc uint32) uint32 { - return table[3][crc>>24] ^ - table[2][(crc>>16)&0xFF] ^ - table[1][(crc>>8)&0xFF] ^ - table[0][crc&0xFF] -} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - - // This method is inspired from the algorithm in Intel's white paper: - // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" - // The same strategy of splitting the buffer in three is used but the - // combining calculation is different; the complete derivation is explained - // below. - // - // -- The basic idea -- - // - // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a - // time. In recent Intel architectures the instruction takes 3 cycles; - // however the processor can pipeline up to three instructions if they - // don't depend on each other. - // - // Roughly this means that we can process three buffers in about the same - // time we can process one buffer. - // - // The idea is then to split the buffer in three, CRC the three pieces - // separately and then combine the results. - // - // Combining the results requires precomputed tables, so we must choose a - // fixed buffer length to optimize. The longer the length, the faster; but - // only buffers longer than this length will use the optimization. We choose - // two cutoffs and compute tables for both: - // - one around 512: 168*3=504 - // - one around 4KB: 1344*3=4032 - // - // -- The nitty gritty -- - // - // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with - // initial non-inverted CRC I). This function has the following properties: - // (a) CRC(I, AB) = CRC(CRC(I, A), B) - // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B) - // - // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of - // K bytes each, where K is a fixed constant. Let O be the sequence of K zero - // bytes. - // - // CRC(I, ABC) = CRC(I, ABO xor C) - // = CRC(I, ABO) xor CRC(0, C) - // = CRC(CRC(I, AB), O) xor CRC(0, C) - // = CRC(CRC(I, AO xor B), O) xor CRC(0, C) - // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C) - // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C) - // - // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B), - // and CRC(0, C) efficiently. 
We just need to find a way to quickly compute - // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these - // values; since we can't have a 32-bit table, we break it up into four - // 8-bit tables: - // - // CRC(uvwx, O) = CRC(u000, O) xor - // CRC(0v00, O) xor - // CRC(00w0, O) xor - // CRC(000x, O) - // - // We can compute tables corresponding to the four terms for all 8-bit - // values. - - crc = ^crc - - // If a buffer is long enough to use the optimization, process the first few - // bytes to align the buffer to an 8 byte boundary (if necessary). - if len(p) >= castagnoliK1*3 { - delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) - if delta != 0 { - delta = 8 - delta - crc = castagnoliSSE42(crc, p[:delta]) - p = p[delta:] - } - } - - // Process 3*K2 at a time. - for len(p) >= castagnoliK2*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK2:], p[castagnoliK2*2:], - castagnoliK2/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC - p = p[castagnoliK2*3:] - } - - // Process 3*K1 at a time. - for len(p) >= castagnoliK1*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK1:], p[castagnoliK1*2:], - castagnoliK1/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC - p = p[castagnoliK1*3:] - } - - // Use the simple implementation for what's left. - crc = castagnoliSSE42(crc, p) - return ^crc -} - -func archAvailableIEEE() bool { - return useFastIEEE -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !useFastIEEE { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !useFastIEEE { - panic("not available") - } - - if len(p) >= 64 { - left := len(p) & 15 - do := len(p) - left - crc = ^ieeeCLMUL(^crc, p[:do]) - p = p[do:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s deleted file mode 100644 index e8a7941c..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.s +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// castagnoliSSE42 updates the (non-inverted) crc with the given buffer. -// -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - // If there are fewer than 8 bytes to process, skip alignment. - CMPQ CX, $8 - JL less_than_8 - - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - // Process the first few bytes to 8-byte align the input. - - // BX = 8 - BX. We need to process this many bytes to align. 
- SUBQ $1, BX - XORQ $7, BX - - BTQ $0, BX - JNC align_2 - - CRC32B (SI), AX - DECQ CX - INCQ SI - -align_2: - BTQ $1, BX - JNC align_4 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $2, CX - ADDQ $2, SI - -align_4: - BTQ $2, BX - JNC aligned - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $4, CX - ADDQ $4, SI - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL less_than_8 - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -less_than_8: - // We may have some bytes left over; process 4 bytes, then 2, then 1. - BTQ $2, CX - JNC less_than_4 - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $4, SI - -less_than_4: - BTQ $1, CX - JNC less_than_2 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $2, SI - -less_than_2: - BTQ $0, CX - JNC done - - CRC32B (SI), AX - -done: - MOVL AX, ret+32(FP) - RET - -// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds) -// bytes from each buffer. -// -// func castagnoliSSE42Triple( -// crc1, crc2, crc3 uint32, -// a, b, c []byte, -// rounds uint32, -// ) (retA uint32, retB uint32, retC uint32) -TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0 - MOVL crcA+0(FP), AX - MOVL crcB+4(FP), CX - MOVL crcC+8(FP), DX - - MOVQ a+16(FP), R8 // data pointer - MOVQ b+40(FP), R9 // data pointer - MOVQ c+64(FP), R10 // data pointer - - MOVL rounds+88(FP), R11 - -loop: - CRC32Q (R8), AX - CRC32Q (R9), CX - CRC32Q (R10), DX - - CRC32Q 8(R8), AX - CRC32Q 8(R9), CX - CRC32Q 8(R10), DX - - CRC32Q 16(R8), AX - CRC32Q 16(R9), CX - CRC32Q 16(R10), DX - - ADDQ $24, R8 - ADDQ $24, R9 - ADDQ $24, R10 - - DECQ R11 - JNZ loop - - MOVL AX, retA+96(FP) - MOVL CX, retB+100(FP) - MOVL DX, retC+104(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveCLMUL() bool -TEXT ·haveCLMUL(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $1, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveSSE41() bool -TEXT ·haveSSE41(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $19, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// CRC32 polynomial data -// -// These constants are lifted from the -// Linux kernel, since they avoid the costly -// PSHUFB 16 byte reversal proposed in the -// original Intel paper. -DATA r2r1kp<>+0(SB)/8, $0x154442bd4 -DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 -DATA r4r3kp<>+0(SB)/8, $0x1751997d0 -DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e -DATA rupolykp<>+0(SB)/8, $0x1db710641 -DATA rupolykp<>+8(SB)/8, $0x1f7011641 -DATA r5kp<>+0(SB)/8, $0x163cd6124 - -GLOBL r2r1kp<>(SB), RODATA, $16 -GLOBL r4r3kp<>(SB), RODATA, $16 -GLOBL rupolykp<>(SB), RODATA, $16 -GLOBL r5kp<>(SB), RODATA, $8 - -// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// len(p) must be at least 64, and must be a multiple of 16. 
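ieeeCLMUL below leans entirely on PCLMULQDQ, which multiplies two 64-bit polynomials over GF(2) (XOR in place of carrying addition) into a 128-bit result. As a mental model only, a software sketch of what one such multiply computes (this is not the vendored implementation):

    package main

    import "fmt"

    // clmul64 models one PCLMULQDQ lane: the carry-less (GF(2)) product of
    // two 64-bit polynomials, returned as the two halves of a 128-bit result.
    func clmul64(a, b uint64) (hi, lo uint64) {
        for i := uint(0); i < 64; i++ {
            if b&(1<<i) != 0 {
                lo ^= a << i
                hi ^= a >> (64 - i) // a >> 64 is 0 in Go, so i == 0 is safe
            }
        }
        return hi, lo
    }

    func main() {
        hi, lo := clmul64(3, 3) // (x+1)*(x+1) = x^2+1: no middle term, no carry
        fmt.Println(hi, lo)     // 0 5
    }

Each pass of loopback64 folds a 16-byte lane X into the stream as clmul(X.lo, K1) xor clmul(X.hi, K2) xor next-data, where the $0 and $0x11 immediates select the low and high quadwords and K1/K2 (here, hypothetical names for the two halves of r2r1kp) have the form x^k mod P'(x), chosen so each lane's contribution lands at the correct offset of the final polynomial.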
- -// func ieeeCLMUL(crc uint32, p []byte) uint32 -TEXT ·ieeeCLMUL(SB), NOSPLIT, $0 - MOVL crc+0(FP), X0 // Initial CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - MOVOU (SI), X1 - MOVOU 16(SI), X2 - MOVOU 32(SI), X3 - MOVOU 48(SI), X4 - PXOR X0, X1 - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left - JB remain64 - - MOVOA r2r1kp<>+0(SB), X0 - -loopback64: - MOVOA X1, X5 - MOVOA X2, X6 - MOVOA X3, X7 - MOVOA X4, X8 - - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0, X0, X2 - PCLMULQDQ $0, X0, X3 - PCLMULQDQ $0, X0, X4 - - // Load next early - MOVOU (SI), X11 - MOVOU 16(SI), X12 - MOVOU 32(SI), X13 - MOVOU 48(SI), X14 - - PCLMULQDQ $0x11, X0, X5 - PCLMULQDQ $0x11, X0, X6 - PCLMULQDQ $0x11, X0, X7 - PCLMULQDQ $0x11, X0, X8 - - PXOR X5, X1 - PXOR X6, X2 - PXOR X7, X3 - PXOR X8, X4 - - PXOR X11, X1 - PXOR X12, X2 - PXOR X13, X3 - PXOR X14, X4 - - ADDQ $0x40, DI - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left? - JGE loopback64 - - // Fold result into a single register (X1) -remain64: - MOVOA r4r3kp<>+0(SB), X0 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X2, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X3, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X4, X1 - - // If there is less than 16 bytes left we are done - CMPQ CX, $16 - JB finish - - // Encode 16 bytes -remain16: - MOVOU (SI), X10 - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X10, X1 - SUBQ $16, CX - ADDQ $16, SI - CMPQ CX, $16 - JGE remain16 - -finish: - // Fold final result into 32 bits and return it - PCMPEQB X3, X3 - PCLMULQDQ $1, X1, X0 - PSRLDQ $8, X1 - PXOR X0, X1 - - MOVOA X1, X2 - MOVQ r5kp<>+0(SB), X0 - - // Creates 32 bit mask. Note that we don't care about upper half. - PSRLQ $32, X3 - - PSRLDQ $4, X2 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - MOVOA rupolykp<>+0(SB), X0 - - MOVOA X1, X2 - PAND X3, X1 - PCLMULQDQ $0x10, X0, X1 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - // PEXTRD $1, X1, AX (SSE 4.1) - BYTE $0x66; BYTE $0x0f; BYTE $0x3a - BYTE $0x16; BYTE $0xc8; BYTE $0x01 - MOVL AX, ret+32(FP) - - RET diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go deleted file mode 100644 index 3222b06a..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -package crc32 - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// CRC. - -// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2 -// support. -func haveSSE42() bool - -// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("not available") - } - // No initialization necessary. 
-} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - return castagnoliSSE42(crc, p) -} - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s deleted file mode 100644 index a578d685..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVL p+4(FP), SI // data pointer - MOVL p_len+8(FP), CX // len(p) - - NOTL AX - - // If there's less than 8 bytes to process, we do it byte-by-byte. - CMPQ CX, $8 - JL cleanup - - // Process individual bytes until the input is 8-byte aligned. -startup: - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - CRC32B (SI), AX - DECQ CX - INCQ SI - JMP startup - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL cleanup - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -cleanup: - // We may have some bytes left over that we process one at a time. - CMPQ CX, $0 - JE done - - CRC32B (SI), AX - INCQ SI - DECQ CX - JMP cleanup - -done: - NOTL AX - MOVL AX, ret+16(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go deleted file mode 100644 index abacbb66..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_generic.go +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file contains CRC32 algorithms that are not specific to any architecture -// and don't use hardware acceleration. -// -// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table. -// -// The slicing-by-8 algorithm is a faster implementation that uses a bigger -// table (8*256*4 bytes). - -package crc32 - -// simpleMakeTable allocates and constructs a Table for the specified -// polynomial. The table is suitable for use with the simple algorithm -// (simpleUpdate). -func simpleMakeTable(poly uint32) *Table { - t := new(Table) - simplePopulateTable(poly, t) - return t -} - -// simplePopulateTable constructs a Table for the specified polynomial, suitable -// for use with simpleUpdate. -func simplePopulateTable(poly uint32, t *Table) { - for i := 0; i < 256; i++ { - crc := uint32(i) - for j := 0; j < 8; j++ { - if crc&1 == 1 { - crc = (crc >> 1) ^ poly - } else { - crc >>= 1 - } - } - t[i] = crc - } -} - -// simpleUpdate uses the simple algorithm to update the CRC, given a table that -// was previously computed using simpleMakeTable. -func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { - crc = ^crc - for _, v := range p { - crc = tab[byte(crc)^v] ^ (crc >> 8) - } - return ^crc -} - -// Use slicing-by-8 when payload >= this value. 
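Both generic paths must produce identical results for every input length, including lengths below the cutoff declared next. A hypothetical regression test for that invariant (not present in the vendored tree; it assumes the package's exported Castagnoli constant and uses simpleMakeTable/simpleUpdate from above together with slicingMakeTable/slicingUpdate defined just below):

    package crc32

    import "testing"

    func TestSlicingMatchesSimple(t *testing.T) {
        simple := simpleMakeTable(Castagnoli)
        sliced := slicingMakeTable(Castagnoli)
        p := make([]byte, 100)
        for i := range p {
            p[i] = byte(i * 131)
        }
        // Cover lengths on both sides of the slicing8Cutoff fast path.
        for n := 0; n <= len(p); n++ {
            got := slicingUpdate(0, sliced, p[:n])
            want := simpleUpdate(0, simple, p[:n])
            if got != want {
                t.Fatalf("len %d: slicing-by-8 = %#x, simple = %#x", n, got, want)
            }
        }
    }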
-const slicing8Cutoff = 16 - -// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm. -type slicing8Table [8]Table - -// slicingMakeTable constructs a slicing8Table for the specified polynomial. The -// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate). -func slicingMakeTable(poly uint32) *slicing8Table { - t := new(slicing8Table) - simplePopulateTable(poly, &t[0]) - for i := 0; i < 256; i++ { - crc := t[0][i] - for j := 1; j < 8; j++ { - crc = t[0][crc&0xFF] ^ (crc >> 8) - t[j][i] = crc - } - } - return t -} - -// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a -// table that was previously computed using slicingMakeTable. -func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { - if len(p) >= slicing8Cutoff { - crc = ^crc - for len(p) > 8 { - crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 - crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ - tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ - tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] - p = p[8:] - } - crc = ^crc - } - if len(p) == 0 { - return crc - } - return simpleUpdate(crc, &tab[0], p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go deleted file mode 100644 index cc960764..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_otherarch.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64,!amd64p32,!s390x - -package crc32 - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } - -func archAvailableCastagnoli() bool { return false } -func archInitCastagnoli() { panic("not available") } -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go deleted file mode 100644 index ce96f032..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -package crc32 - -const ( - vxMinLen = 64 - vxAlignMask = 15 // align to 16 bytes -) - -// hasVectorFacility reports whether the machine has the z/Architecture -// vector facility installed and enabled. -func hasVectorFacility() bool - -var hasVX = hasVectorFacility() - -// vectorizedCastagnoli implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedCastagnoli(crc uint32, p []byte) uint32 - -// vectorizedIEEE implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedIEEE(crc uint32, p []byte) uint32 - -func archAvailableCastagnoli() bool { - return hasVX -} - -var archCastagnoliTable8 *slicing8Table - -func archInitCastagnoli() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archCastagnoliTable8 = slicingMakeTable(Castagnoli) -} - -// archUpdateCastagnoli calculates the checksum of p using -// vectorizedCastagnoli. 
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedCastagnoli(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archCastagnoliTable8, p) -} - -func archAvailableIEEE() bool { - return hasVX -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -// archUpdateIEEE calculates the checksum of p using vectorizedIEEE. -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedIEEE(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s deleted file mode 100644 index e980ca29..00000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.s +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -#include "textflag.h" - -// Vector register range containing CRC-32 constants - -#define CONST_PERM_LE2BE V9 -#define CONST_R2R1 V10 -#define CONST_R4R3 V11 -#define CONST_R5 V12 -#define CONST_RU_POLY V13 -#define CONST_CRC_POLY V14 - -// The CRC-32 constant block contains reduction constants to fold and -// process particular chunks of the input data stream in parallel. -// -// Note that the constant definitions below are extended in order to compute -// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. -// The rightmost doubleword can be 0 to prevent contribution to the result or -// can be multiplied by 1 to perform an XOR without the need for a separate -// VECTOR EXCLUSIVE OR instruction. 
-// The polynomials used are bit-reflected:
-//
-//    IEEE: P'(x) = 0x0edb88320
-//    Castagnoli: P'(x) = 0x082f63b78
-
-// IEEE polynomial constants
-DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
-DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
-DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
-DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
-DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
-DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
-DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
-
-GLOBL ·crcleconskp(SB), RODATA, $144
-
-// Castagnoli polynomial constants
-DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
-DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
-DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
-DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
-DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
-DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
-DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
-
-GLOBL ·crccleconskp(SB), RODATA, $144
-
-// func hasVectorFacility() bool
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
-    MOVD  $x-24(SP), R1
-    XC    $24, 0(R1), 0(R1) // clear the storage
-    MOVD  $2, R0            // R0 is the number of double words stored -1
-    WORD  $0xB2B01000       // STFLE 0(R1)
-    XOR   R0, R0            // reset the value of R0
-    MOVBZ z-8(SP), R1
-    AND   $0x40, R1
-    BEQ   novector
-
-vectorinstalled:
-    // check if the vector instruction has been enabled
-    VLEIB  $0, $0xF, V16
-    VLGVB  $0, V16, R1
-    CMPBNE R1, $0xF, novector
-    MOVB   $1, ret+0(FP) // have vx
-    RET
-
-novector:
-    MOVB $0, ret+0(FP) // no vx
-    RET
-
-// The CRC-32 functions use these calling conventions:
-//
-// Parameters:
-//
-//    R2: Initial CRC value, typically ~0; and final CRC (return) value.
-//    R3: Input buffer pointer; performance might be improved if the
-//        buffer is on a doubleword boundary.
-//    R4: Length of the buffer; must be 64 bytes or greater.
-//
-// Register usage:
-//
-//    R5:      CRC-32 constant pool base pointer.
-//    V0:      Initial CRC value and intermediate constants and results.
-//    V1..V4:  Data for CRC computation.
-//    V5..V8:  Next data chunks that are fetched from the input buffer.
-//
-//    V9..V14: CRC-32 constants.
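For reference, the bit-reflected P'(x) values quoted above are just the standard polynomials with their 32 bits reversed, and in the Go wrapper (crc32_s390x.go above) len(p) &^ vxAlignMask merely rounds the buffer length down to a whole number of 16-byte vector blocks. A standalone check of the reflection (not vendored code):

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        fmt.Printf("%#x\n", bits.Reverse32(0x04c11db7)) // 0xedb88320: IEEE P'(x)
        fmt.Printf("%#x\n", bits.Reverse32(0x1edc6f41)) // 0x82f63b78: Castagnoli P'(x)
    }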
-
-// func vectorizedIEEE(crc uint32, p []byte) uint32
-TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
-    MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-    MOVD  p+8(FP), R3      // data pointer
-    MOVD  p_len+16(FP), R4 // len(p)
-
-    MOVD $·crcleconskp(SB), R5
-    BR   vectorizedBody<>(SB)
-
-// func vectorizedCastagnoli(crc uint32, p []byte) uint32
-TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
-    MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-    MOVD  p+8(FP), R3      // data pointer
-    MOVD  p_len+16(FP), R4 // len(p)
-
-    // R5: crc-32 constant pool base pointer, constant is used to reduce crc
-    MOVD $·crccleconskp(SB), R5
-    BR   vectorizedBody<>(SB)
-
-TEXT vectorizedBody<>(SB), NOSPLIT, $0
-    XOR $0xffffffff, R2 // NOTW R2
-    VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
-
-    // Load the initial CRC value into the rightmost word of V0
-    VZERO V0
-    VLVGF $3, R2, V0
-
-    // Crash if the input size is less than 64 bytes.
-    CMP R4, $64
-    BLT crash
-
-    // Load a 64-byte data chunk and XOR with CRC
-    VLM 0(R3), V1, V4 // 64 bytes into V1..V4
-
-    // Reflect the data if the CRC operation is in the bit-reflected domain
-    VPERM V1, V1, CONST_PERM_LE2BE, V1
-    VPERM V2, V2, CONST_PERM_LE2BE, V2
-    VPERM V3, V3, CONST_PERM_LE2BE, V3
-    VPERM V4, V4, CONST_PERM_LE2BE, V4
-
-    VX  V0, V1, V1 // V1 ^= CRC
-    ADD $64, R3    // BUF = BUF + 64
-    ADD $(-64), R4
-
-    // Check remaining buffer size and jump to proper folding method
-    CMP R4, $64
-    BLT less_than_64bytes
-
-fold_64bytes_loop:
-    // Load the next 64-byte data chunk into V5 to V8
-    VLM 0(R3), V5, V8
-    VPERM V5, V5, CONST_PERM_LE2BE, V5
-    VPERM V6, V6, CONST_PERM_LE2BE, V6
-    VPERM V7, V7, CONST_PERM_LE2BE, V7
-    VPERM V8, V8, CONST_PERM_LE2BE, V8
-
-    // Perform a GF(2) multiplication of the doublewords in V1 with
-    // the reduction constants in V0. The intermediate result is
-    // then folded (accumulated) with the next data chunk in V5 and
-    // stored in V1. Repeat this step for the register contents
-    // in V2, V3, and V4 respectively.
-
-    VGFMAG CONST_R2R1, V1, V5, V1
-    VGFMAG CONST_R2R1, V2, V6, V2
-    VGFMAG CONST_R2R1, V3, V7, V3
-    VGFMAG CONST_R2R1, V4, V8, V4
-
-    // Adjust buffer pointer and length for next loop
-    ADD $64, R3    // BUF = BUF + 64
-    ADD $(-64), R4 // LEN = LEN - 64
-
-    CMP R4, $64
-    BGE fold_64bytes_loop
-
-less_than_64bytes:
-    // Fold V1 to V4 into a single 128-bit value in V1
-    VGFMAG CONST_R4R3, V1, V2, V1
-    VGFMAG CONST_R4R3, V1, V3, V1
-    VGFMAG CONST_R4R3, V1, V4, V1
-
-    // Check whether to continue with 16-byte folding
-    CMP R4, $16
-    BLT final_fold
-
-fold_16bytes_loop:
-    VL 0(R3), V2 // Load next data chunk
-    VPERM V2, V2, CONST_PERM_LE2BE, V2
-
-    VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
-
-    // Adjust buffer pointer and size for folding next data chunk
-    ADD $16, R3
-    ADD $-16, R4
-
-    // Process remaining data chunks
-    CMP R4, $16
-    BGE fold_16bytes_loop
-
-final_fold:
-    VLEIB $7, $0x40, V9
-    VSRLB V9, CONST_R4R3, V0
-    VLEIG $0, $1, V0
-
-    VGFMG V0, V1, V1
-
-    VLEIB  $7, $0x20, V9         // Shift by words
-    VSRLB  V9, V1, V2            // Store remaining bits in V2
-    VUPLLF V1, V1                // Split rightmost doubleword
-    VGFMAG CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
-
-    // The input values to the Barrett reduction are the degree-63 polynomial
-    // in V1 (R(x)), the degree-32 generator polynomial, and the reduction
-    // constant u. The Barrett reduction result is the CRC value of R(x) mod
-    // P(x).
-    //
-    // The Barrett reduction algorithm is defined as:
-    //
-    //    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
-    //    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
-    //    3. C(x)  = R(x) XOR T2(x) mod x^32
-    //
-    // Note: To compensate for the division by x^32, use the vector unpack
-    // instruction to move the leftmost word into the leftmost doubleword
-    // of the vector register. The rightmost doubleword is multiplied
-    // by zero so that it does not contribute to the intermediate results.
-
-    // T1(x) = floor( R(x) / x^32 ) GF2MUL u
-    VUPLLF V1, V2
-    VGFMG  CONST_RU_POLY, V2, V2
-
-    // Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
-    // with T1(x) in V2 and XOR the intermediate result, T2(x), with the
-    // value in V1. The final result is in the rightmost word of V2.
-
-    VUPLLF V2, V2
-    VGFMAG CONST_CRC_POLY, V2, V1, V2
-
-done:
-    VLGVF $2, V2, R2
-    XOR   $0xffffffff, R2 // NOTW R2
-    MOVWZ R2, ret+32(FP)
-    RET
-
-crash:
-    MOVD $0, (R0) // input size is less than 64 bytes
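The three Barrett steps can be transcribed almost literally into integer code. The standalone sketch below is illustrative only, not vendored code: it works in the natural bit order with the IEEE polynomial rather than in the bit-reflected domain the assembly uses, and it checks the Barrett result against plain polynomial long division, with clmul standing in for the hardware GF(2) multiply.

    package main

    import "fmt"

    // clmul is a software GF(2) (carry-less) multiply, 64x64 -> 128 bits;
    // it stands in for the VGFMG/PCLMULQDQ hardware instructions.
    func clmul(a, b uint64) (hi, lo uint64) {
        for i := uint(0); i < 64; i++ {
            if b&(1<<i) != 0 {
                lo ^= a << i
                hi ^= a >> (64 - i) // a >> 64 is 0 in Go
            }
        }
        return hi, lo
    }

    // polyMod reduces a degree-63 polynomial r modulo p (degree 32, with its
    // x^32 term present, e.g. 0x104c11db7) by plain long division.
    func polyMod(r, p uint64) uint32 {
        for i := 63; i >= 32; i-- {
            if r&(1<<uint(i)) != 0 {
                r ^= p << uint(i-32)
            }
        }
        return uint32(r)
    }

    // divX64 returns u = floor(x^64 / p), the reduction constant for a
    // degree-32 polynomial p.
    func divX64(p uint64) uint64 {
        hi, lo := uint64(1), uint64(0) // hi:lo = x^64
        var q uint64
        for i := 64; i >= 32; i-- {
            var set bool
            if i == 64 {
                set = hi&1 != 0
            } else {
                set = lo&(1<<uint(i)) != 0
            }
            if set {
                sh := uint(i - 32)
                q |= 1 << sh
                lo ^= p << sh
                hi ^= p >> (64 - sh) // p >> 64 is 0 when sh == 0
            }
        }
        return q
    }

    func main() {
        const p = 0x104c11db7 // the IEEE CRC-32 polynomial, x^32 term included
        u := divX64(p)        // the reduction constant u

        r := uint64(0xfeedfacecafebeef) // an arbitrary degree-63 R(x)

        _, t1 := clmul(r>>32, u)  // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
        _, t2 := clmul(t1>>32, p) // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
        c := uint32(r ^ t2)       // 3. C(x)  = R(x) XOR T2(x) mod x^32

        fmt.Println(c == polyMod(r, p)) // true: Barrett matches long division
    }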