Diffstat (limited to 'vendor/github.com/templexxx/xorsimd')
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitattributes    |   1
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitignore        |  13
-rw-r--r--  vendor/github.com/templexxx/xorsimd/LICENSE           |  21
-rw-r--r--  vendor/github.com/templexxx/xorsimd/README.md         |  46
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.mod            |   5
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.sum            |   2
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor.go            |  89
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_amd64.go      |  95
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_generic.go    | 205
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s   | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s  |  72
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s   | 123
13 files changed, 920 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/templexxx/xorsimd/.gitattributes b/vendor/github.com/templexxx/xorsimd/.gitattributes new file mode 100644 index 0000000..68f7d04 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitattributes @@ -0,0 +1 @@ +*.s linguist-language=go:x diff --git a/vendor/github.com/templexxx/xorsimd/.gitignore b/vendor/github.com/templexxx/xorsimd/.gitignore new file mode 100644 index 0000000..43309f8 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitignore @@ -0,0 +1,13 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out +.idea diff --git a/vendor/github.com/templexxx/xorsimd/LICENSE b/vendor/github.com/templexxx/xorsimd/LICENSE new file mode 100644 index 0000000..08ee714 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Temple3x (temple3x@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/templexxx/xorsimd/README.md b/vendor/github.com/templexxx/xorsimd/README.md new file mode 100644 index 0000000..9dce5c9 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/README.md @@ -0,0 +1,46 @@ +# XOR SIMD + +[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10] + +[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg +[2]: https://godoc.org/github.com/templexxx/xorsimd +[3]: https://img.shields.io/badge/license-MIT-blue.svg +[4]: LICENSE +[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg +[6]: https://github.com/templexxx/xorsimd +[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd +[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd +[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg +[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge + +## Introduction: + +>- XOR code engine in pure Go. +> +>- [High Performance](https://github.com/templexxx/xorsimd#performance): +More than 270GB/s per physics core. + +## Performance + +Performance depends mainly on: + +>- CPU instruction extension. +> +>- Number of source row vectors. 
+ +**Platform:** + +*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)* + +**All test run on a single Core.** + +`I/O = (src_num + 1) * vector_size / cost` + +| Src Num | Vector size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) |SSE2 I/O (MB/S) | +|-------|-------------|-------------|---------------|---------------| +|5|4KB| 270403.73 | 142825.25 | 74443.91 | +|5|1MB| 26948.34 | 26887.37 | 26950.65 | +|5|8MB| 17881.32 | 17212.56 | 16402.97 | +|10|4KB| 190445.30 | 102953.59 | 53244.04 | +|10|1MB| 26424.44 | 26618.65 | 26094.39 | +|10|8MB| 15471.31 | 14866.72 | 13565.80 | diff --git a/vendor/github.com/templexxx/xorsimd/go.mod b/vendor/github.com/templexxx/xorsimd/go.mod new file mode 100644 index 0000000..ac5f57f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.mod @@ -0,0 +1,5 @@ +module github.com/templexxx/xorsimd + +require github.com/templexxx/cpu v0.0.1 + +go 1.13 diff --git a/vendor/github.com/templexxx/xorsimd/go.sum b/vendor/github.com/templexxx/xorsimd/go.sum new file mode 100644 index 0000000..04d04de --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.sum @@ -0,0 +1,2 @@ +github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY= +github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk= diff --git a/vendor/github.com/templexxx/xorsimd/xor.go b/vendor/github.com/templexxx/xorsimd/xor.go new file mode 100644 index 0000000..ae88911 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor.go @@ -0,0 +1,89 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +import "github.com/templexxx/cpu" + +// EnableAVX512 may slow down CPU Clock (maybe not). +// TODO need more research: +// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/ +var EnableAVX512 = true + +// cpuFeature indicates which instruction set will be used. +var cpuFeature = getCPUFeature() + +const ( + avx512 = iota + avx2 + sse2 + generic +) + +// TODO: Add ARM feature... +func getCPUFeature() int { + if hasAVX512() && EnableAVX512 { + return avx512 + } else if cpu.X86.HasAVX2 { + return avx2 + } else { + return sse2 // amd64 must has sse2 + } +} + +func hasAVX512() (ok bool) { + + return cpu.X86.HasAVX512VL && + cpu.X86.HasAVX512BW && + cpu.X86.HasAVX512F && + cpu.X86.HasAVX512DQ +} + +// Encode encodes elements from source slice into a +// destination slice. The source and destination may overlap. +// Encode returns the number of bytes encoded, which will be the minimum of +// len(src[i]) and len(dst). +func Encode(dst []byte, src [][]byte) (n int) { + n = checkLen(dst, src) + if n == 0 { + return + } + + dst = dst[:n] + for i := range src { + src[i] = src[i][:n] + } + + if len(src) == 1 { + copy(dst, src[0]) + return + } + + encode(dst, src) + return +} + +func checkLen(dst []byte, src [][]byte) int { + n := len(dst) + for i := range src { + if len(src[i]) < n { + n = len(src[i]) + } + } + + if n <= 0 { + return 0 + } + return n +} + +// Bytes XORs the bytes in a and b into a +// destination slice. The source and destination may overlap. +// +// Bytes returns the number of bytes encoded, which will be the minimum of +// len(dst), len(a), len(b). 
+func Bytes(dst, a, b []byte) int { + return Encode(dst, [][]byte{a, b}) +} diff --git a/vendor/github.com/templexxx/xorsimd/xor_amd64.go b/vendor/github.com/templexxx/xorsimd/xor_amd64.go new file mode 100644 index 0000000..5d46df3 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_amd64.go @@ -0,0 +1,95 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +func encode(dst []byte, src [][]byte) { + + switch cpuFeature { + case avx512: + encodeAVX512(dst, src) + case avx2: + encodeAVX2(dst, src) + default: + encodeSSE2(dst, src) + } + return +} + +// Bytes8 XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16 XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// Bytes8Align XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8Align(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16Align XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16Align(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// BytesA XORs the len(a) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesA(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(a)) +} + +// BytesB XORs the len(b) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesB(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(b)) +} + +//go:noescape +func encodeAVX512(dst []byte, src [][]byte) + +//go:noescape +func encodeAVX2(dst []byte, src [][]byte) + +//go:noescape +func encodeSSE2(dst []byte, src [][]byte) + +//go:noescape +func bytesN(dst, a, b *byte, n int) + +//go:noescape +func bytes8(dst, a, b *byte) + +//go:noescape +func bytes16(dst, a, b *byte) diff --git a/vendor/github.com/templexxx/xorsimd/xor_generic.go b/vendor/github.com/templexxx/xorsimd/xor_generic.go new file mode 100644 index 0000000..b12908f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_generic.go @@ -0,0 +1,205 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. +// +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
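[Editor's note] Usage sketch for the package-level API defined in xor.go above. Input values are made up for illustration; per the doc comments, `Encode` returns the minimum common length it actually XORed, and `Bytes` is simply the two-operand form of `Encode`.

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa, 0x55}
	b := []byte{0xff, 0xff, 0x00, 0x00}
	c := []byte{0x01, 0x02, 0x03, 0x04}

	parity := make([]byte, 4)
	n := xorsimd.Encode(parity, [][]byte{a, b, c}) // parity[i] = a[i] ^ b[i] ^ c[i]
	fmt.Println(n, parity)

	dst := make([]byte, 4)
	n = xorsimd.Bytes(dst, a, b) // two-operand convenience wrapper over Encode
	fmt.Println(n, dst)
}
```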
+ +// +build !amd64 + +package xorsimd + +import ( + "runtime" + "unsafe" +) + +const wordSize = int(unsafe.Sizeof(uintptr(0))) +const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" + +func encode(dst []byte, src [][]byte) { + if supportsUnaligned { + fastEncode(dst, src, len(dst)) + } else { + // TODO(hanwen): if (dst, a, b) have common alignment + // we could still try fastEncode. It is not clear + // how often this happens, and it's only worth it if + // the block encryption itself is hardware + // accelerated. + safeEncode(dst, src, len(dst)) + } + +} + +// fastEncode xor in bulk. It only works on architectures that +// support unaligned read/writes. +func fastEncode(dst []byte, src [][]byte, n int) { + w := n / wordSize + if w > 0 { + wordBytes := w * wordSize + + wordAlignSrc := make([][]byte, len(src)) + for i := range src { + wordAlignSrc[i] = src[i][:wordBytes] + } + fastEnc(dst[:wordBytes], wordAlignSrc) + } + + for i := n - n%wordSize; i < n; i++ { + s := src[0][i] + for j := 1; j < len(src); j++ { + s ^= src[j][i] + } + dst[i] = s + } +} + +func fastEnc(dst []byte, src [][]byte) { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + sw := make([][]uintptr, len(src)) + for i := range src { + sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i])) + } + + n := len(dst) / wordSize + for i := 0; i < n; i++ { + s := sw[0][i] + for j := 1; j < len(sw); j++ { + s ^= sw[j][i] + } + dw[i] = s + } +} + +func safeEncode(dst []byte, src [][]byte, n int) { + for i := 0; i < n; i++ { + s := src[0][i] + for j := 1; j < len(src); j++ { + s ^= src[j][i] + } + dst[i] = s + } +} + +// Bytes8 XORs of word 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8(dst, a, b []byte) { + + bytesWords(dst[:8], a[:8], b[:8]) +} + +// Bytes16 XORs of packed doubleword 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16(dst, a, b []byte) { + + bytesWords(dst[:16], a[:16], b[:16]) +} + +// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture.) +// The slice arguments a and b are assumed to be of equal length. +func bytesWords(dst, a, b []byte) { + if supportsUnaligned { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + n := len(b) / wordSize + for i := 0; i < n; i++ { + dw[i] = aw[i] ^ bw[i] + } + } else { + n := len(b) + for i := 0; i < n; i++ { + dst[i] = a[i] ^ b[i] + } + } +} + +// Bytes8Align XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +// +// All the byte slices must be aligned to wordsize. +func Bytes8Align(dst, a, b []byte) { + + bytesWordsAlign(dst[:8], a[:8], b[:8]) +} + +// Bytes16Align XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +// +// All the byte slices must be aligned to wordsize. +func Bytes16Align(dst, a, b []byte) { + + bytesWordsAlign(dst[:16], a[:16], b[:16]) +} + +// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture.) +// The slice arguments a and b are assumed to be of equal length. +// +// All the byte slices must be aligned to wordsize. 
+func bytesWordsAlign(dst, a, b []byte) { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + n := len(b) / wordSize + for i := 0; i < n; i++ { + dw[i] = aw[i] ^ bw[i] + } +} + +// BytesA XORs the len(a) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesA(dst, a, b []byte) { + + n := len(a) + bytesN(dst[:n], a[:n], b[:n], n) +} + +// BytesB XORs the len(b) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesB(dst, a, b []byte) { + + n := len(b) + bytesN(dst[:n], a[:n], b[:n], n) +} + +func bytesN(dst, a, b []byte, n int) { + + switch { + case supportsUnaligned: + w := n / wordSize + if w > 0 { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + for i := 0; i < w; i++ { + dw[i] = aw[i] ^ bw[i] + } + } + + for i := (n - n%wordSize); i < n; i++ { + dst[i] = a[i] ^ b[i] + } + default: + for i := 0; i < n; i++ { + dst[i] = a[i] ^ b[i] + } + } +} diff --git a/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s new file mode 100644 index 0000000..23cf924 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. 
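[Editor's note] Before the assembly implementations, a short sketch of how the fixed-width helpers above are meant to be called. The length constraints are assumptions taken from the doc comments: `Bytes8`/`Bytes16` require at least 8/16 bytes in every argument and panic otherwise, while `BytesA`/`BytesB` take the XOR length from `a`/`b` respectively.

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := make([]byte, 16)
	b := make([]byte, 16)
	dst := make([]byte, 16)
	for i := range a {
		a[i], b[i] = byte(i), byte(0xff-i)
	}

	xorsimd.Bytes8(dst, a, b)  // XORs the first 8 bytes only
	xorsimd.Bytes16(dst, a, b) // XORs all 16 bytes

	small := []byte{1, 2, 3} // short, possibly unaligned input
	out := make([]byte, 3)
	xorsimd.BytesA(out, small, b) // XORs len(small) = 3 bytes
	fmt.Println(dst, out)
}
```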
+ +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX2(dst []byte, src [][]byte) +TEXT ·encodeAVX2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ s+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $127, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop128b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU (src_tmp)(pos*1), Y0 + VMOVDQU 32(src_tmp)(pos*1), Y1 + VMOVDQU 64(src_tmp)(pos*1), Y2 + VMOVDQU 96(src_tmp)(pos*1), Y3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU (src_tmp)(pos*1), Y4 + VMOVDQU 32(src_tmp)(pos*1), Y5 + VMOVDQU 64(src_tmp)(pos*1), Y6 + VMOVDQU 96(src_tmp)(pos*1), Y7 + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU Y0, (dst)(pos*1) + VMOVDQU Y1, 32(dst)(pos*1) + VMOVDQU Y2, 64(dst)(pos*1) + VMOVDQU Y3, 96(dst)(pos*1) + + ADDQ $128, pos + CMPQ len, pos + JNE loop128b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $127, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $127, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $128 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s new file mode 100644 index 0000000..2ba6b75 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX512(dst []byte, src [][]byte) +TEXT ·encodeAVX512(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $255, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop256b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU8 (src_tmp)(pos*1), Z0 + VMOVDQU8 64(src_tmp)(pos*1), Z1 + VMOVDQU8 128(src_tmp)(pos*1), Z2 + VMOVDQU8 192(src_tmp)(pos*1), Z3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU8 (src_tmp)(pos*1), Z4 + VMOVDQU8 64(src_tmp)(pos*1), Z5 + VMOVDQU8 128(src_tmp)(pos*1), Z6 + VMOVDQU8 192(src_tmp)(pos*1), Z7 + VPXORQ Z4, Z0, Z0 + VPXORQ Z5, Z1, Z1 + VPXORQ Z6, Z2, Z2 + VPXORQ Z7, Z3, Z3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU8 Z0, (dst)(pos*1) + VMOVDQU8 Z1, 64(dst)(pos*1) + VMOVDQU8 Z2, 128(dst)(pos*1) + VMOVDQU8 Z3, 192(dst)(pos*1) + + ADDQ $256, pos + CMPQ len, pos + JNE loop256b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $255, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $255, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $256 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s new file mode 100644 index 0000000..8f67edd --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s @@ -0,0 +1,72 @@ +#include "textflag.h" + +// func bytesN(dst, a, b *byte, n int) +TEXT ·bytesN(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ n+24(FP), DX + TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned. + JNZ not_aligned + +aligned: + MOVQ $0, AX // position in slices + +loop16b: + MOVOU (SI)(AX*1), X0 // XOR 16byte forwards. + MOVOU (CX)(AX*1), X1 + PXOR X1, X0 + MOVOU X0, (BX)(AX*1) + ADDQ $16, AX + CMPQ DX, AX + JNE loop16b + RET + +loop_1b: + SUBQ $1, DX // XOR 1byte backwards. + MOVB (SI)(DX*1), DI + MOVB (CX)(DX*1), AX + XORB AX, DI + MOVB DI, (BX)(DX*1) + TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b. + JNZ loop_1b + CMPQ DX, $0 // if len is 0, ret. 
+ JE ret + TESTQ $15, DX // AND 15 & len, if zero jump to aligned. + JZ aligned + +not_aligned: + TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b. + JNE loop_1b + SUBQ $8, DX // XOR 8bytes backwards. + MOVQ (SI)(DX*1), DI + MOVQ (CX)(DX*1), AX + XORQ AX, DI + MOVQ DI, (BX)(DX*1) + CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned. + JGE aligned + +ret: + RET + +// func bytes8(dst, a, b *byte) +TEXT ·bytes8(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ (SI), DI + MOVQ (CX), AX + XORQ AX, DI + MOVQ DI, (BX) + RET + +// func bytes16(dst, a, b *byte) +TEXT ·bytes16(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVOU (SI), X0 + MOVOU (CX), X1 + PXOR X1, X0 + MOVOU X0, (BX) + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s new file mode 100644 index 0000000..38df948 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s @@ -0,0 +1,123 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeSSE2(dst []byte, src [][]byte) +TEXT ·encodeSSE2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $63, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop64b: + MOVQ csrc, csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X0 + MOVOU 16(src_tmp)(pos*1), X1 + MOVOU 32(src_tmp)(pos*1), X2 + MOVOU 48(src_tmp)(pos*1), X3 + +next_vect: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X4 + MOVOU 16(src_tmp)(pos*1), X5 + MOVOU 32(src_tmp)(pos*1), X6 + MOVOU 48(src_tmp)(pos*1), X7 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + SUBQ $1, csrc_tmp + JGE next_vect + + MOVOU X0, (dst)(pos*1) + MOVOU X1, 16(dst)(pos*1) + MOVOU X2, 32(dst)(pos*1) + MOVOU X3, 48(dst)(pos*1) + + ADDQ $64, pos + CMPQ len, pos + JNE loop64b + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $63, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $63, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $64 + JGE aligned + RET + +ret: + RET |
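[Editor's note] One detail worth calling out in the assembly above: every encode* routine walks the `[][]byte` backing array in 24-byte steps (`ADDQ $24, d2src_off // len(slice) = 24`), because on amd64 each `[]byte` header is three 8-byte words (data pointer, len, cap). A small sketch confirming that layout assumption:

```go
package main

import (
	"fmt"
	"unsafe"
)

// sliceHeader mirrors the runtime layout of a []byte on 64-bit platforms:
// three 8-byte words, which is why the assembly advances 24 bytes per
// source vector.
type sliceHeader struct {
	data unsafe.Pointer
	len  int
	cap  int
}

func main() {
	src := [][]byte{make([]byte, 4), make([]byte, 4)}
	fmt.Println(unsafe.Sizeof(src[0]), unsafe.Sizeof(sliceHeader{})) // 24 24 on amd64
}
```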