Diffstat (limited to 'vendor/github.com/templexxx/xorsimd')
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitattributes     |   1
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitignore         |  13
-rw-r--r--  vendor/github.com/templexxx/xorsimd/LICENSE            |  21
-rw-r--r--  vendor/github.com/templexxx/xorsimd/README.md          |  46
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.mod             |   5
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.sum             |   2
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor.go             |  89
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_amd64.go       |  95
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_generic.go     | 205
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s    | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s  | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s   |  72
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s    | 123
13 files changed, 920 insertions, 0 deletions
diff --git a/vendor/github.com/templexxx/xorsimd/.gitattributes b/vendor/github.com/templexxx/xorsimd/.gitattributes
new file mode 100644
index 0000000..68f7d04
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/.gitattributes
@@ -0,0 +1 @@
+*.s linguist-language=go:x
diff --git a/vendor/github.com/templexxx/xorsimd/.gitignore b/vendor/github.com/templexxx/xorsimd/.gitignore
new file mode 100644
index 0000000..43309f8
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/.gitignore
@@ -0,0 +1,13 @@
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, build with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+.idea
diff --git a/vendor/github.com/templexxx/xorsimd/LICENSE b/vendor/github.com/templexxx/xorsimd/LICENSE
new file mode 100644
index 0000000..08ee714
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Temple3x (temple3x@gmail.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/vendor/github.com/templexxx/xorsimd/README.md b/vendor/github.com/templexxx/xorsimd/README.md
new file mode 100644
index 0000000..9dce5c9
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/README.md
@@ -0,0 +1,46 @@
+# XOR SIMD
+
+[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10]
+
+[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg
+[2]: https://godoc.org/github.com/templexxx/xorsimd
+[3]: https://img.shields.io/badge/license-MIT-blue.svg
+[4]: LICENSE
+[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg
+[6]: https://github.com/templexxx/xorsimd
+[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd
+[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd
+[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg
+[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge
+
+## Introduction
+
+>- XOR code engine in pure Go.
+>
+>- [High Performance](https://github.com/templexxx/xorsimd#performance):
+More than 270 GB/s per physical core.
+
+## Performance
+
+Performance depends mainly on:
+
+>- CPU instruction set extension.
+>
+>- Number of source row vectors.
+
+**Platform:**
+
+*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)*
+
+**All tests run on a single core.**
+
+`I/O = (src_num + 1) * vector_size / cost`, i.e. total bytes read plus written divided by the elapsed time (see the usage sketch after the table).
+
+| Src Num | Vector Size | AVX512 I/O (MB/s) | AVX2 I/O (MB/s) | SSE2 I/O (MB/s) |
+|---------|-------------|-------------------|-----------------|-----------------|
+|5|4KB| 270403.73 | 142825.25 | 74443.91 |
+|5|1MB| 26948.34 | 26887.37 | 26950.65 |
+|5|8MB| 17881.32 | 17212.56 | 16402.97 |
+|10|4KB| 190445.30 | 102953.59 | 53244.04 |
+|10|1MB| 26424.44 | 26618.65 | 26094.39 |
+|10|8MB| 15471.31 | 14866.72 | 13565.80 |
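For reference, a minimal usage sketch of the `Encode` API defined in xor.go below; the sizes only mirror the 4KB / 5-source benchmark row and are not required by the API:

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	const vectorSize = 4 * 1024 // 4KB, as in the first benchmark row
	const srcNum = 5

	src := make([][]byte, srcNum)
	for i := range src {
		src[i] = make([]byte, vectorSize)
		src[i][0] = byte(i + 1) // some sample data
	}
	dst := make([]byte, vectorSize)

	// dst[i] = src[0][i] ^ src[1][i] ^ ... ^ src[srcNum-1][i]
	n := xorsimd.Encode(dst, src)
	fmt.Println(n, dst[0]) // 4096 1  (1^2^3^4^5 == 1)
}
```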
diff --git a/vendor/github.com/templexxx/xorsimd/go.mod b/vendor/github.com/templexxx/xorsimd/go.mod
new file mode 100644
index 0000000..ac5f57f
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/go.mod
@@ -0,0 +1,5 @@
+module github.com/templexxx/xorsimd
+
+require github.com/templexxx/cpu v0.0.1
+
+go 1.13
diff --git a/vendor/github.com/templexxx/xorsimd/go.sum b/vendor/github.com/templexxx/xorsimd/go.sum
new file mode 100644
index 0000000..04d04de
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/go.sum
@@ -0,0 +1,2 @@
+github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY=
+github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk=
diff --git a/vendor/github.com/templexxx/xorsimd/xor.go b/vendor/github.com/templexxx/xorsimd/xor.go
new file mode 100644
index 0000000..ae88911
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xor.go
@@ -0,0 +1,89 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+
+package xorsimd
+
+import "github.com/templexxx/cpu"
+
+// EnableAVX512 controls whether the AVX-512 code path may be used.
+// AVX-512 might lower the CPU clock on some processors (results are inconclusive).
+// TODO: needs more research:
+// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/
+var EnableAVX512 = true
+
+// cpuFeature indicates which instruction set will be used.
+var cpuFeature = getCPUFeature()
+
+const (
+ avx512 = iota
+ avx2
+ sse2
+ generic
+)
+
+// TODO: Add ARM feature...
+func getCPUFeature() int {
+ if hasAVX512() && EnableAVX512 {
+ return avx512
+ } else if cpu.X86.HasAVX2 {
+ return avx2
+ } else {
+ return sse2 // amd64 is guaranteed to have SSE2
+ }
+}
+
+func hasAVX512() (ok bool) {
+
+ return cpu.X86.HasAVX512VL &&
+ cpu.X86.HasAVX512BW &&
+ cpu.X86.HasAVX512F &&
+ cpu.X86.HasAVX512DQ
+}
+
+// Encode XORs the bytes of the source slices into the
+// destination slice. The sources and destination may overlap.
+// Encode returns the number of bytes encoded, which is the minimum of
+// len(dst) and every len(src[i]).
+func Encode(dst []byte, src [][]byte) (n int) {
+ n = checkLen(dst, src)
+ if n == 0 {
+ return
+ }
+
+ dst = dst[:n]
+ for i := range src {
+ src[i] = src[i][:n]
+ }
+
+ if len(src) == 1 {
+ copy(dst, src[0])
+ return
+ }
+
+ encode(dst, src)
+ return
+}
+
+func checkLen(dst []byte, src [][]byte) int {
+ n := len(dst)
+ for i := range src {
+ if len(src[i]) < n {
+ n = len(src[i])
+ }
+ }
+
+ if n <= 0 {
+ return 0
+ }
+ return n
+}
+
+// Bytes XORs the bytes in a and b into a
+// destination slice. The sources and destination may overlap.
+//
+// Bytes returns the number of bytes encoded, which is the minimum of
+// len(dst), len(a), and len(b).
+func Bytes(dst, a, b []byte) int {
+ return Encode(dst, [][]byte{a, b})
+}
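Since XOR is its own inverse, the same `Encode` call both produces a parity row and rebuilds a lost row; a short sketch of that round trip (the data values are arbitrary):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	// Three data rows and one parity row, all the same length.
	d0 := []byte{1, 2, 3, 4}
	d1 := []byte{5, 6, 7, 8}
	d2 := []byte{9, 10, 11, 12}

	parity := make([]byte, 4)
	xorsimd.Encode(parity, [][]byte{d0, d1, d2}) // parity = d0 ^ d1 ^ d2

	// Pretend d1 was lost: XORing the parity with the surviving rows rebuilds it.
	rebuilt := make([]byte, 4)
	xorsimd.Encode(rebuilt, [][]byte{parity, d0, d2})

	fmt.Println(bytes.Equal(rebuilt, d1)) // true
}
```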
diff --git a/vendor/github.com/templexxx/xorsimd/xor_amd64.go b/vendor/github.com/templexxx/xorsimd/xor_amd64.go
new file mode 100644
index 0000000..5d46df3
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xor_amd64.go
@@ -0,0 +1,95 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+
+package xorsimd
+
+func encode(dst []byte, src [][]byte) {
+
+ switch cpuFeature {
+ case avx512:
+ encodeAVX512(dst, src)
+ case avx2:
+ encodeAVX2(dst, src)
+ default:
+ encodeSSE2(dst, src)
+ }
+ return
+}
+
+// Bytes8 XORs 8 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 8;
+// if not, Bytes8 will panic.
+func Bytes8(dst, a, b []byte) {
+
+ bytes8(&dst[0], &a[0], &b[0])
+}
+
+// Bytes16 XORs 16 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 16;
+// if not, Bytes16 will panic.
+func Bytes16(dst, a, b []byte) {
+
+ bytes16(&dst[0], &a[0], &b[0])
+}
+
+// Bytes8Align XORs 8 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 8;
+// if not, Bytes8Align will panic.
+func Bytes8Align(dst, a, b []byte) {
+
+ bytes8(&dst[0], &a[0], &b[0])
+}
+
+// Bytes16Align XORs 16 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 16;
+// if not, Bytes16Align will panic.
+func Bytes16Align(dst, a, b []byte) {
+
+ bytes16(&dst[0], &a[0], &b[0])
+}
+
+// BytesA XORs the first len(a) bytes of a and b into a
+// destination slice.
+// The destination must have enough space.
+//
+// It's intended for encoding small byte slices (up to a few dozen bytes)
+// that may not be aligned to 8 or 16 bytes.
+// For larger slices, use 'func Bytes(dst, a, b []byte)' instead
+// to get better performance.
+func BytesA(dst, a, b []byte) {
+
+ bytesN(&dst[0], &a[0], &b[0], len(a))
+}
+
+// BytesB XORs the first len(b) bytes of a and b into a
+// destination slice.
+// The destination must have enough space.
+//
+// It's intended for encoding small byte slices (up to a few dozen bytes)
+// that may not be aligned to 8 or 16 bytes.
+// For larger slices, use 'func Bytes(dst, a, b []byte)' instead
+// to get better performance.
+func BytesB(dst, a, b []byte) {
+
+ bytesN(&dst[0], &a[0], &b[0], len(b))
+}
+
+//go:noescape
+func encodeAVX512(dst []byte, src [][]byte)
+
+//go:noescape
+func encodeAVX2(dst []byte, src [][]byte)
+
+//go:noescape
+func encodeSSE2(dst []byte, src [][]byte)
+
+//go:noescape
+func bytesN(dst, a, b *byte, n int)
+
+//go:noescape
+func bytes8(dst, a, b *byte)
+
+//go:noescape
+func bytes16(dst, a, b *byte)
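A short sketch of when each of the small-XOR helpers above might be used; the 16-byte and 13-byte buffer sizes are only illustrative:

```go
package main

import "github.com/templexxx/xorsimd"

func main() {
	// Fixed-size XOR: all three slices must be at least 16 bytes, or Bytes16 panics.
	a16 := make([]byte, 16)
	b16 := make([]byte, 16)
	dst16 := make([]byte, 16)
	xorsimd.Bytes16(dst16, a16, b16)

	// Small, oddly sized buffers: BytesA XORs len(a) bytes, so dst and b
	// only need to be at least that long.
	a := []byte("0123456789abc") // 13 bytes
	b := make([]byte, 13)
	dst := make([]byte, 13)
	xorsimd.BytesA(dst, a, b)
}
```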
diff --git a/vendor/github.com/templexxx/xorsimd/xor_generic.go b/vendor/github.com/templexxx/xorsimd/xor_generic.go
new file mode 100644
index 0000000..b12908f
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xor_generic.go
@@ -0,0 +1,205 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+//
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package xorsimd
+
+import (
+ "runtime"
+ "unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
+
+func encode(dst []byte, src [][]byte) {
+ if supportsUnaligned {
+ fastEncode(dst, src, len(dst))
+ } else {
+ // TODO(hanwen): if (dst, a, b) have common alignment
+ // we could still try fastEncode. It is not clear
+ // how often this happens, and it's only worth it if
+ // the block encryption itself is hardware
+ // accelerated.
+ safeEncode(dst, src, len(dst))
+ }
+
+}
+
+// fastEncode XORs in bulk. It only works on architectures that
+// support unaligned reads/writes.
+func fastEncode(dst []byte, src [][]byte, n int) {
+ w := n / wordSize
+ if w > 0 {
+ wordBytes := w * wordSize
+
+ wordAlignSrc := make([][]byte, len(src))
+ for i := range src {
+ wordAlignSrc[i] = src[i][:wordBytes]
+ }
+ fastEnc(dst[:wordBytes], wordAlignSrc)
+ }
+
+ for i := n - n%wordSize; i < n; i++ {
+ s := src[0][i]
+ for j := 1; j < len(src); j++ {
+ s ^= src[j][i]
+ }
+ dst[i] = s
+ }
+}
+
+func fastEnc(dst []byte, src [][]byte) {
+ dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+ sw := make([][]uintptr, len(src))
+ for i := range src {
+ sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i]))
+ }
+
+ n := len(dst) / wordSize
+ for i := 0; i < n; i++ {
+ s := sw[0][i]
+ for j := 1; j < len(sw); j++ {
+ s ^= sw[j][i]
+ }
+ dw[i] = s
+ }
+}
+
+func safeEncode(dst []byte, src [][]byte, n int) {
+ for i := 0; i < n; i++ {
+ s := src[0][i]
+ for j := 1; j < len(src); j++ {
+ s ^= src[j][i]
+ }
+ dst[i] = s
+ }
+}
+
+// Bytes8 XORs 8 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 8;
+// if not, Bytes8 will panic.
+func Bytes8(dst, a, b []byte) {
+
+ bytesWords(dst[:8], a[:8], b[:8])
+}
+
+// Bytes16 XORs 16 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 16;
+// if not, Bytes16 will panic.
+func Bytes16(dst, a, b []byte) {
+
+ bytesWords(dst[:16], a[:16], b[:16])
+}
+
+// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture).
+// The slice arguments a and b are assumed to be of equal length.
+func bytesWords(dst, a, b []byte) {
+ if supportsUnaligned {
+ dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+ aw := *(*[]uintptr)(unsafe.Pointer(&a))
+ bw := *(*[]uintptr)(unsafe.Pointer(&b))
+ n := len(b) / wordSize
+ for i := 0; i < n; i++ {
+ dw[i] = aw[i] ^ bw[i]
+ }
+ } else {
+ n := len(b)
+ for i := 0; i < n; i++ {
+ dst[i] = a[i] ^ b[i]
+ }
+ }
+}
+
+// Bytes8Align XORs 8 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 8;
+// if not, Bytes8Align will panic.
+//
+// All the byte slices must be aligned to wordsize.
+func Bytes8Align(dst, a, b []byte) {
+
+ bytesWordsAlign(dst[:8], a[:8], b[:8])
+}
+
+// Bytes16Align XORs 16 bytes of a and b into dst.
+// The lengths of dst, a, and b are assumed to be at least 16;
+// if not, Bytes16Align will panic.
+//
+// All the byte slices must be aligned to wordsize.
+func Bytes16Align(dst, a, b []byte) {
+
+ bytesWordsAlign(dst[:16], a[:16], b[:16])
+}
+
+// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture).
+// The slice arguments a and b are assumed to be of equal length.
+//
+// All the byte slices must be aligned to wordsize.
+func bytesWordsAlign(dst, a, b []byte) {
+ dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+ aw := *(*[]uintptr)(unsafe.Pointer(&a))
+ bw := *(*[]uintptr)(unsafe.Pointer(&b))
+ n := len(b) / wordSize
+ for i := 0; i < n; i++ {
+ dw[i] = aw[i] ^ bw[i]
+ }
+}
+
+// BytesA XORs the first len(a) bytes of a and b into a
+// destination slice.
+// The destination must have enough space.
+//
+// It's intended for encoding small byte slices (up to a few dozen bytes)
+// that may not be aligned to 8 or 16 bytes.
+// For larger slices, use 'func Bytes(dst, a, b []byte)' instead
+// to get better performance.
+func BytesA(dst, a, b []byte) {
+
+ n := len(a)
+ bytesN(dst[:n], a[:n], b[:n], n)
+}
+
+// BytesB XORs the first len(b) bytes of a and b into a
+// destination slice.
+// The destination must have enough space.
+//
+// It's intended for encoding small byte slices (up to a few dozen bytes)
+// that may not be aligned to 8 or 16 bytes.
+// For larger slices, use 'func Bytes(dst, a, b []byte)' instead
+// to get better performance.
+func BytesB(dst, a, b []byte) {
+
+ n := len(b)
+ bytesN(dst[:n], a[:n], b[:n], n)
+}
+
+func bytesN(dst, a, b []byte, n int) {
+
+ switch {
+ case supportsUnaligned:
+ w := n / wordSize
+ if w > 0 {
+ dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+ aw := *(*[]uintptr)(unsafe.Pointer(&a))
+ bw := *(*[]uintptr)(unsafe.Pointer(&b))
+ for i := 0; i < w; i++ {
+ dw[i] = aw[i] ^ bw[i]
+ }
+ }
+
+ for i := (n - n%wordSize); i < n; i++ {
+ dst[i] = a[i] ^ b[i]
+ }
+ default:
+ for i := 0; i < n; i++ {
+ dst[i] = a[i] ^ b[i]
+ }
+ }
+}
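One way to sanity-check the word-at-a-time generic path above (and the SIMD paths on amd64) is to compare it against a naive byte loop for a few awkward lengths. A sketch of such a test; it is not part of the library's own test suite:

```go
package xorsimd_test

import (
	"bytes"
	"math/rand"
	"testing"

	"github.com/templexxx/xorsimd"
)

func TestBytesMatchesNaiveXOR(t *testing.T) {
	for _, n := range []int{1, 7, 8, 15, 16, 129, 1000} {
		a := make([]byte, n)
		b := make([]byte, n)
		rand.Read(a)
		rand.Read(b)

		// Naive byte-by-byte reference result.
		want := make([]byte, n)
		for i := range want {
			want[i] = a[i] ^ b[i]
		}

		got := make([]byte, n)
		xorsimd.Bytes(got, a, b)
		if !bytes.Equal(got, want) {
			t.Fatalf("length %d: result differs from naive XOR", n)
		}
	}
}
```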
diff --git a/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s
new file mode 100644
index 0000000..23cf924
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s
@@ -0,0 +1,124 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define dst BX // parity's address
+#define d2src SI // two-dimensional src slice's address
+#define csrc CX // cnt of src
+#define len DX // len of vect
+#define pos R8 // job position in vect
+
+#define csrc_tmp R9
+#define d2src_off R10
+#define src_tmp R11
+#define not_aligned_len R12
+#define src_val0 R13
+#define src_val1 R14
+
+// func encodeAVX2(dst []byte, src [][]byte)
+TEXT ·encodeAVX2(SB), NOSPLIT, $0
+ MOVQ d+0(FP), dst
+ MOVQ s+24(FP), d2src
+ MOVQ c+32(FP), csrc
+ MOVQ l+8(FP), len
+ TESTQ $127, len
+ JNZ not_aligned
+
+aligned:
+ MOVQ $0, pos
+
+loop128b:
+ MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
+ VMOVDQU (src_tmp)(pos*1), Y0
+ VMOVDQU 32(src_tmp)(pos*1), Y1
+ VMOVDQU 64(src_tmp)(pos*1), Y2
+ VMOVDQU 96(src_tmp)(pos*1), Y3
+
+next_vect:
+ ADDQ $24, d2src_off // len(slice) = 24
+ MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
+ VMOVDQU (src_tmp)(pos*1), Y4
+ VMOVDQU 32(src_tmp)(pos*1), Y5
+ VMOVDQU 64(src_tmp)(pos*1), Y6
+ VMOVDQU 96(src_tmp)(pos*1), Y7
+ VPXOR Y4, Y0, Y0
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ SUBQ $1, csrc_tmp
+ JGE next_vect
+
+ VMOVDQU Y0, (dst)(pos*1)
+ VMOVDQU Y1, 32(dst)(pos*1)
+ VMOVDQU Y2, 64(dst)(pos*1)
+ VMOVDQU Y3, 96(dst)(pos*1)
+
+ ADDQ $128, pos
+ CMPQ len, pos
+ JNE loop128b
+ VZEROUPPER
+ RET
+
+loop_1b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
+
+next_vect_1b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVB -1(src_tmp)(len*1), src_val1
+ XORB src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_1b
+
+ MOVB src_val0, -1(dst)(len*1)
+ SUBQ $1, len
+ TESTQ $7, len
+ JNZ loop_1b
+
+ CMPQ len, $0
+ JE ret
+ TESTQ $127, len
+ JZ aligned
+
+not_aligned:
+ TESTQ $7, len
+ JNE loop_1b
+ MOVQ len, not_aligned_len
+ ANDQ $127, not_aligned_len
+
+loop_8b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ -8(src_tmp)(len*1), src_val0
+
+next_vect_8b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVQ -8(src_tmp)(len*1), src_val1
+ XORQ src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_8b
+
+ MOVQ src_val0, -8(dst)(len*1)
+ SUBQ $8, len
+ SUBQ $8, not_aligned_len
+ JG loop_8b
+
+ CMPQ len, $128
+ JGE aligned
+ RET
+
+ret:
+ RET
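The `ADDQ $24, d2src_off` / `len(slice) = 24` step above walks the `[][]byte` header array: on amd64 each slice header occupies 24 bytes (data pointer, length, capacity), and the kernel only needs the data pointer of each row. A small sketch confirming that layout assumption:

```go
package main

import (
	"fmt"
	"unsafe"
)

func main() {
	src := make([][]byte, 3)
	// A slice header on a 64-bit platform is pointer + len + cap = 24 bytes,
	// which is exactly the stride the assembly uses to reach the next row.
	fmt.Println(unsafe.Sizeof(src[0])) // 24
}
```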
diff --git a/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s
new file mode 100644
index 0000000..2ba6b75
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s
@@ -0,0 +1,124 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define dst BX // parity's address
+#define d2src SI // two-dimensional src slice's address
+#define csrc CX // cnt of src
+#define len DX // len of vect
+#define pos R8 // job position in vect
+
+#define csrc_tmp R9
+#define d2src_off R10
+#define src_tmp R11
+#define not_aligned_len R12
+#define src_val0 R13
+#define src_val1 R14
+
+// func encodeAVX512(dst []byte, src [][]byte)
+TEXT ·encodeAVX512(SB), NOSPLIT, $0
+ MOVQ d+0(FP), dst
+ MOVQ src+24(FP), d2src
+ MOVQ c+32(FP), csrc
+ MOVQ l+8(FP), len
+ TESTQ $255, len
+ JNZ not_aligned
+
+aligned:
+ MOVQ $0, pos
+
+loop256b:
+ MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
+ VMOVDQU8 (src_tmp)(pos*1), Z0
+ VMOVDQU8 64(src_tmp)(pos*1), Z1
+ VMOVDQU8 128(src_tmp)(pos*1), Z2
+ VMOVDQU8 192(src_tmp)(pos*1), Z3
+
+next_vect:
+ ADDQ $24, d2src_off // len(slice) = 24
+ MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
+ VMOVDQU8 (src_tmp)(pos*1), Z4
+ VMOVDQU8 64(src_tmp)(pos*1), Z5
+ VMOVDQU8 128(src_tmp)(pos*1), Z6
+ VMOVDQU8 192(src_tmp)(pos*1), Z7
+ VPXORQ Z4, Z0, Z0
+ VPXORQ Z5, Z1, Z1
+ VPXORQ Z6, Z2, Z2
+ VPXORQ Z7, Z3, Z3
+ SUBQ $1, csrc_tmp
+ JGE next_vect
+
+ VMOVDQU8 Z0, (dst)(pos*1)
+ VMOVDQU8 Z1, 64(dst)(pos*1)
+ VMOVDQU8 Z2, 128(dst)(pos*1)
+ VMOVDQU8 Z3, 192(dst)(pos*1)
+
+ ADDQ $256, pos
+ CMPQ len, pos
+ JNE loop256b
+ VZEROUPPER
+ RET
+
+loop_1b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
+
+next_vect_1b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVB -1(src_tmp)(len*1), src_val1
+ XORB src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_1b
+
+ MOVB src_val0, -1(dst)(len*1)
+ SUBQ $1, len
+ TESTQ $7, len
+ JNZ loop_1b
+
+ CMPQ len, $0
+ JE ret
+ TESTQ $255, len
+ JZ aligned
+
+not_aligned:
+ TESTQ $7, len
+ JNE loop_1b
+ MOVQ len, not_aligned_len
+ ANDQ $255, not_aligned_len
+
+loop_8b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ -8(src_tmp)(len*1), src_val0
+
+next_vect_8b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVQ -8(src_tmp)(len*1), src_val1
+ XORQ src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_8b
+
+ MOVQ src_val0, -8(dst)(len*1)
+ SUBQ $8, len
+ SUBQ $8, not_aligned_len
+ JG loop_8b
+
+ CMPQ len, $256
+ JGE aligned
+ RET
+
+ret:
+ RET
diff --git a/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s
new file mode 100644
index 0000000..8f67edd
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s
@@ -0,0 +1,72 @@
+#include "textflag.h"
+
+// func bytesN(dst, a, b *byte, n int)
+TEXT ·bytesN(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVQ n+24(FP), DX
+ TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
+ JNZ not_aligned
+
+aligned:
+ MOVQ $0, AX // position in slices
+
+loop16b:
+ MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
+ MOVOU (CX)(AX*1), X1
+ PXOR X1, X0
+ MOVOU X0, (BX)(AX*1)
+ ADDQ $16, AX
+ CMPQ DX, AX
+ JNE loop16b
+ RET
+
+loop_1b:
+ SUBQ $1, DX // XOR 1byte backwards.
+ MOVB (SI)(DX*1), DI
+ MOVB (CX)(DX*1), AX
+ XORB AX, DI
+ MOVB DI, (BX)(DX*1)
+ TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
+ JNZ loop_1b
+ CMPQ DX, $0 // if len is 0, ret.
+ JE ret
+ TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
+ JZ aligned
+
+not_aligned:
+ TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
+ JNE loop_1b
+ SUBQ $8, DX // XOR 8bytes backwards.
+ MOVQ (SI)(DX*1), DI
+ MOVQ (CX)(DX*1), AX
+ XORQ AX, DI
+ MOVQ DI, (BX)(DX*1)
+ CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
+ JGE aligned
+
+ret:
+ RET
+
+// func bytes8(dst, a, b *byte)
+TEXT ·bytes8(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVQ (SI), DI
+ MOVQ (CX), AX
+ XORQ AX, DI
+ MOVQ DI, (BX)
+ RET
+
+// func bytes16(dst, a, b *byte)
+TEXT ·bytes16(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVOU (SI), X0
+ MOVOU (CX), X1
+ PXOR X1, X0
+ MOVOU X0, (BX)
+ RET
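bytesN above handles arbitrary lengths by trimming from the end: byte by byte until the remaining length is a multiple of 8, then one 8-byte word so the rest is a multiple of 16, then sweeping the remaining prefix in 16-byte blocks. A pure-Go sketch of that strategy (the real routine does the 16-byte blocks with MOVOU/PXOR):

```go
package main

import "fmt"

// xorN mirrors the control flow of the assembly bytesN in plain Go:
// trim the tail until the remaining length is a multiple of 16,
// then XOR the aligned prefix 16 bytes at a time.
func xorN(dst, a, b []byte, n int) {
	for n%8 != 0 { // 1-byte steps, backwards from the end
		n--
		dst[n] = a[n] ^ b[n]
	}
	if n%16 != 0 && n > 0 { // one 8-byte word, so the rest is a multiple of 16
		n -= 8
		for i := n; i < n+8; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
	for pos := 0; pos < n; pos += 16 { // main loop, forwards
		for i := pos; i < pos+16; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
}

func main() {
	a := []byte("an input of awkward length!") // 27 bytes: exercises all three loops
	b := make([]byte, len(a))
	dst := make([]byte, len(a))
	xorN(dst, a, b, len(a)) // b is all zeros, so dst == a
	fmt.Println(string(dst))
}
```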
diff --git a/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s
new file mode 100644
index 0000000..38df948
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s
@@ -0,0 +1,123 @@
+// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
+//
+// Use of this source code is governed by the MIT License
+// that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define dst BX // parity's address
+#define d2src SI // two-dimensional src slice's address
+#define csrc CX // cnt of src
+#define len DX // len of vect
+#define pos R8 // job position in vect
+
+#define csrc_tmp R9
+#define d2src_off R10
+#define src_tmp R11
+#define not_aligned_len R12
+#define src_val0 R13
+#define src_val1 R14
+
+// func encodeSSE2(dst []byte, src [][]byte)
+TEXT ·encodeSSE2(SB), NOSPLIT, $0
+ MOVQ d+0(FP), dst
+ MOVQ src+24(FP), d2src
+ MOVQ c+32(FP), csrc
+ MOVQ l+8(FP), len
+ TESTQ $63, len
+ JNZ not_aligned
+
+aligned:
+ MOVQ $0, pos
+
+loop64b:
+ MOVQ csrc, csrc_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVOU (src_tmp)(pos*1), X0
+ MOVOU 16(src_tmp)(pos*1), X1
+ MOVOU 32(src_tmp)(pos*1), X2
+ MOVOU 48(src_tmp)(pos*1), X3
+
+next_vect:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVOU (src_tmp)(pos*1), X4
+ MOVOU 16(src_tmp)(pos*1), X5
+ MOVOU 32(src_tmp)(pos*1), X6
+ MOVOU 48(src_tmp)(pos*1), X7
+ PXOR X4, X0
+ PXOR X5, X1
+ PXOR X6, X2
+ PXOR X7, X3
+ SUBQ $1, csrc_tmp
+ JGE next_vect
+
+ MOVOU X0, (dst)(pos*1)
+ MOVOU X1, 16(dst)(pos*1)
+ MOVOU X2, 32(dst)(pos*1)
+ MOVOU X3, 48(dst)(pos*1)
+
+ ADDQ $64, pos
+ CMPQ len, pos
+ JNE loop64b
+ RET
+
+loop_1b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVB -1(src_tmp)(len*1), src_val0
+
+next_vect_1b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVB -1(src_tmp)(len*1), src_val1
+ XORB src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_1b
+
+ MOVB src_val0, -1(dst)(len*1)
+ SUBQ $1, len
+ TESTQ $7, len
+ JNZ loop_1b
+
+ CMPQ len, $0
+ JE ret
+ TESTQ $63, len
+ JZ aligned
+
+not_aligned:
+ TESTQ $7, len
+ JNE loop_1b
+ MOVQ len, not_aligned_len
+ ANDQ $63, not_aligned_len
+
+loop_8b:
+ MOVQ csrc, csrc_tmp
+ MOVQ $0, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ SUBQ $2, csrc_tmp
+ MOVQ -8(src_tmp)(len*1), src_val0
+
+next_vect_8b:
+ ADDQ $24, d2src_off
+ MOVQ (d2src)(d2src_off*1), src_tmp
+ MOVQ -8(src_tmp)(len*1), src_val1
+ XORQ src_val1, src_val0
+ SUBQ $1, csrc_tmp
+ JGE next_vect_8b
+
+ MOVQ src_val0, -8(dst)(len*1)
+ SUBQ $8, len
+ SUBQ $8, not_aligned_len
+ JG loop_8b
+
+ CMPQ len, $64
+ JGE aligned
+ RET
+
+ret:
+ RET