Diffstat (limited to 'vendor/github.com/templexxx/xorsimd')
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitattributes    |   1
-rw-r--r--  vendor/github.com/templexxx/xorsimd/.gitignore        |  13
-rw-r--r--  vendor/github.com/templexxx/xorsimd/LICENSE           |  21
-rw-r--r--  vendor/github.com/templexxx/xorsimd/README.md         |  46
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.mod            |   5
-rw-r--r--  vendor/github.com/templexxx/xorsimd/go.sum            |   2
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor.go            |  89
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_amd64.go      |  95
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xor_generic.go    | 205
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s   | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s | 124
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s  |  72
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s   | 123
13 files changed, 920 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/templexxx/xorsimd/.gitattributes b/vendor/github.com/templexxx/xorsimd/.gitattributes new file mode 100644 index 0000000..68f7d04 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitattributes @@ -0,0 +1 @@ +*.s linguist-language=go:x diff --git a/vendor/github.com/templexxx/xorsimd/.gitignore b/vendor/github.com/templexxx/xorsimd/.gitignore new file mode 100644 index 0000000..43309f8 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitignore @@ -0,0 +1,13 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out +.idea diff --git a/vendor/github.com/templexxx/xorsimd/LICENSE b/vendor/github.com/templexxx/xorsimd/LICENSE new file mode 100644 index 0000000..08ee714 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Temple3x (temple3x@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/templexxx/xorsimd/README.md b/vendor/github.com/templexxx/xorsimd/README.md new file mode 100644 index 0000000..9dce5c9 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/README.md @@ -0,0 +1,46 @@ +# XOR SIMD + +[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10] + +[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg +[2]: https://godoc.org/github.com/templexxx/xorsimd +[3]: https://img.shields.io/badge/license-MIT-blue.svg +[4]: LICENSE +[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg +[6]: https://github.com/templexxx/xorsimd +[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd +[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd +[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg +[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge + +## Introduction: + +>- XOR code engine in pure Go. +> +>- [High Performance](https://github.com/templexxx/xorsimd#performance): +More than 270GB/s per physics core. + +## Performance + +Performance depends mainly on: + +>- CPU instruction extension. +> +>- Number of source row vectors. 
+ +**Platform:** + +*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)* + +**All test run on a single Core.** + +`I/O = (src_num + 1) * vector_size / cost` + +| Src Num | Vector size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) |SSE2 I/O (MB/S) | +|-------|-------------|-------------|---------------|---------------| +|5|4KB| 270403.73 | 142825.25 | 74443.91 | +|5|1MB| 26948.34 | 26887.37 | 26950.65 | +|5|8MB| 17881.32 | 17212.56 | 16402.97 | +|10|4KB| 190445.30 | 102953.59 | 53244.04 | +|10|1MB| 26424.44 | 26618.65 | 26094.39 | +|10|8MB| 15471.31 | 14866.72 | 13565.80 | diff --git a/vendor/github.com/templexxx/xorsimd/go.mod b/vendor/github.com/templexxx/xorsimd/go.mod new file mode 100644 index 0000000..ac5f57f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.mod @@ -0,0 +1,5 @@ +module github.com/templexxx/xorsimd + +require github.com/templexxx/cpu v0.0.1 + +go 1.13 diff --git a/vendor/github.com/templexxx/xorsimd/go.sum b/vendor/github.com/templexxx/xorsimd/go.sum new file mode 100644 index 0000000..04d04de --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.sum @@ -0,0 +1,2 @@ +github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY= +github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk= diff --git a/vendor/github.com/templexxx/xorsimd/xor.go b/vendor/github.com/templexxx/xorsimd/xor.go new file mode 100644 index 0000000..ae88911 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor.go @@ -0,0 +1,89 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +import "github.com/templexxx/cpu" + +// EnableAVX512 may slow down CPU Clock (maybe not). +// TODO need more research: +// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/ +var EnableAVX512 = true + +// cpuFeature indicates which instruction set will be used. +var cpuFeature = getCPUFeature() + +const ( + avx512 = iota + avx2 + sse2 + generic +) + +// TODO: Add ARM feature... +func getCPUFeature() int { + if hasAVX512() && EnableAVX512 { + return avx512 + } else if cpu.X86.HasAVX2 { + return avx2 + } else { + return sse2 // amd64 must has sse2 + } +} + +func hasAVX512() (ok bool) { + + return cpu.X86.HasAVX512VL && + cpu.X86.HasAVX512BW && + cpu.X86.HasAVX512F && + cpu.X86.HasAVX512DQ +} + +// Encode encodes elements from source slice into a +// destination slice. The source and destination may overlap. +// Encode returns the number of bytes encoded, which will be the minimum of +// len(src[i]) and len(dst). +func Encode(dst []byte, src [][]byte) (n int) { + n = checkLen(dst, src) + if n == 0 { + return + } + + dst = dst[:n] + for i := range src { + src[i] = src[i][:n] + } + + if len(src) == 1 { + copy(dst, src[0]) + return + } + + encode(dst, src) + return +} + +func checkLen(dst []byte, src [][]byte) int { + n := len(dst) + for i := range src { + if len(src[i]) < n { + n = len(src[i]) + } + } + + if n <= 0 { + return 0 + } + return n +} + +// Bytes XORs the bytes in a and b into a +// destination slice. The source and destination may overlap. +// +// Bytes returns the number of bytes encoded, which will be the minimum of +// len(dst), len(a), len(b). 
+func Bytes(dst, a, b []byte) int { + return Encode(dst, [][]byte{a, b}) +} diff --git a/vendor/github.com/templexxx/xorsimd/xor_amd64.go b/vendor/github.com/templexxx/xorsimd/xor_amd64.go new file mode 100644 index 0000000..5d46df3 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_amd64.go @@ -0,0 +1,95 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +func encode(dst []byte, src [][]byte) { + + switch cpuFeature { + case avx512: + encodeAVX512(dst, src) + case avx2: + encodeAVX2(dst, src) + default: + encodeSSE2(dst, src) + } + return +} + +// Bytes8 XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16 XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// Bytes8Align XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8Align(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16Align XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16Align(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// BytesA XORs the len(a) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesA(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(a)) +} + +// BytesB XORs the len(b) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesB(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(b)) +} + +//go:noescape +func encodeAVX512(dst []byte, src [][]byte) + +//go:noescape +func encodeAVX2(dst []byte, src [][]byte) + +//go:noescape +func encodeSSE2(dst []byte, src [][]byte) + +//go:noescape +func bytesN(dst, a, b *byte, n int) + +//go:noescape +func bytes8(dst, a, b *byte) + +//go:noescape +func bytes16(dst, a, b *byte) diff --git a/vendor/github.com/templexxx/xorsimd/xor_generic.go b/vendor/github.com/templexxx/xorsimd/xor_generic.go new file mode 100644 index 0000000..b12908f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_generic.go @@ -0,0 +1,205 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. +// +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
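[Editor's note] Usage sketch for the package-level API defined in xor.go above. Input values are made up for illustration; per the doc comments, `Encode` returns the minimum common length it actually XORed, and `Bytes` is simply the two-operand form of `Encode`.

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa, 0x55}
	b := []byte{0xff, 0xff, 0x00, 0x00}
	c := []byte{0x01, 0x02, 0x03, 0x04}

	parity := make([]byte, 4)
	n := xorsimd.Encode(parity, [][]byte{a, b, c}) // parity[i] = a[i] ^ b[i] ^ c[i]
	fmt.Println(n, parity)

	dst := make([]byte, 4)
	n = xorsimd.Bytes(dst, a, b) // two-operand convenience wrapper over Encode
	fmt.Println(n, dst)
}
```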
+ +// +build !amd64 + +package xorsimd + +import ( + "runtime" + "unsafe" +) + +const wordSize = int(unsafe.Sizeof(uintptr(0))) +const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" + +func encode(dst []byte, src [][]byte) { + if supportsUnaligned { + fastEncode(dst, src, len(dst)) + } else { + // TODO(hanwen): if (dst, a, b) have common alignment + // we could still try fastEncode. It is not clear + // how often this happens, and it's only worth it if + // the block encryption itself is hardware + // accelerated. + safeEncode(dst, src, len(dst)) + } + +} + +// fastEncode xor in bulk. It only works on architectures that +// support unaligned read/writes. +func fastEncode(dst []byte, src [][]byte, n int) { + w := n / wordSize + if w > 0 { + wordBytes := w * wordSize + + wordAlignSrc := make([][]byte, len(src)) + for i := range src { + wordAlignSrc[i] = src[i][:wordBytes] + } + fastEnc(dst[:wordBytes], wordAlignSrc) + } + + for i := n - n%wordSize; i < n; i++ { + s := src[0][i] + for j := 1; j < len(src); j++ { + s ^= src[j][i] + } + dst[i] = s + } +} + +func fastEnc(dst []byte, src [][]byte) { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + sw := make([][]uintptr, len(src)) + for i := range src { + sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i])) + } + + n := len(dst) / wordSize + for i := 0; i < n; i++ { + s := sw[0][i] + for j := 1; j < len(sw); j++ { + s ^= sw[j][i] + } + dw[i] = s + } +} + +func safeEncode(dst []byte, src [][]byte, n int) { + for i := 0; i < n; i++ { + s := src[0][i] + for j := 1; j < len(src); j++ { + s ^= src[j][i] + } + dst[i] = s + } +} + +// Bytes8 XORs of word 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8(dst, a, b []byte) { + + bytesWords(dst[:8], a[:8], b[:8]) +} + +// Bytes16 XORs of packed doubleword 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16(dst, a, b []byte) { + + bytesWords(dst[:16], a[:16], b[:16]) +} + +// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture.) +// The slice arguments a and b are assumed to be of equal length. +func bytesWords(dst, a, b []byte) { + if supportsUnaligned { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + n := len(b) / wordSize + for i := 0; i < n; i++ { + dw[i] = aw[i] ^ bw[i] + } + } else { + n := len(b) + for i := 0; i < n; i++ { + dst[i] = a[i] ^ b[i] + } + } +} + +// Bytes8Align XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +// +// All the byte slices must be aligned to wordsize. +func Bytes8Align(dst, a, b []byte) { + + bytesWordsAlign(dst[:8], a[:8], b[:8]) +} + +// Bytes16Align XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +// +// All the byte slices must be aligned to wordsize. +func Bytes16Align(dst, a, b []byte) { + + bytesWordsAlign(dst[:16], a[:16], b[:16]) +} + +// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture.) +// The slice arguments a and b are assumed to be of equal length. +// +// All the byte slices must be aligned to wordsize. 
+func bytesWordsAlign(dst, a, b []byte) { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + n := len(b) / wordSize + for i := 0; i < n; i++ { + dw[i] = aw[i] ^ bw[i] + } +} + +// BytesA XORs the len(a) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesA(dst, a, b []byte) { + + n := len(a) + bytesN(dst[:n], a[:n], b[:n], n) +} + +// BytesB XORs the len(b) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesB(dst, a, b []byte) { + + n := len(b) + bytesN(dst[:n], a[:n], b[:n], n) +} + +func bytesN(dst, a, b []byte, n int) { + + switch { + case supportsUnaligned: + w := n / wordSize + if w > 0 { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + for i := 0; i < w; i++ { + dw[i] = aw[i] ^ bw[i] + } + } + + for i := (n - n%wordSize); i < n; i++ { + dst[i] = a[i] ^ b[i] + } + default: + for i := 0; i < n; i++ { + dst[i] = a[i] ^ b[i] + } + } +} diff --git a/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s new file mode 100644 index 0000000..23cf924 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. 
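[Editor's note] Before the assembly implementations, a short sketch of how the fixed-width helpers above are meant to be called. The length constraints are assumptions taken from the doc comments: `Bytes8`/`Bytes16` require at least 8/16 bytes in every argument and panic otherwise, while `BytesA`/`BytesB` take the XOR length from `a`/`b` respectively.

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := make([]byte, 16)
	b := make([]byte, 16)
	dst := make([]byte, 16)
	for i := range a {
		a[i], b[i] = byte(i), byte(0xff-i)
	}

	xorsimd.Bytes8(dst, a, b)  // XORs the first 8 bytes only
	xorsimd.Bytes16(dst, a, b) // XORs all 16 bytes

	small := []byte{1, 2, 3} // short, possibly unaligned input
	out := make([]byte, 3)
	xorsimd.BytesA(out, small, b) // XORs len(small) = 3 bytes
	fmt.Println(dst, out)
}
```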
+ +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX2(dst []byte, src [][]byte) +TEXT ·encodeAVX2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ s+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $127, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop128b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU (src_tmp)(pos*1), Y0 + VMOVDQU 32(src_tmp)(pos*1), Y1 + VMOVDQU 64(src_tmp)(pos*1), Y2 + VMOVDQU 96(src_tmp)(pos*1), Y3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU (src_tmp)(pos*1), Y4 + VMOVDQU 32(src_tmp)(pos*1), Y5 + VMOVDQU 64(src_tmp)(pos*1), Y6 + VMOVDQU 96(src_tmp)(pos*1), Y7 + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU Y0, (dst)(pos*1) + VMOVDQU Y1, 32(dst)(pos*1) + VMOVDQU Y2, 64(dst)(pos*1) + VMOVDQU Y3, 96(dst)(pos*1) + + ADDQ $128, pos + CMPQ len, pos + JNE loop128b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $127, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $127, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $128 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s new file mode 100644 index 0000000..2ba6b75 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX512(dst []byte, src [][]byte) +TEXT ·encodeAVX512(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $255, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop256b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU8 (src_tmp)(pos*1), Z0 + VMOVDQU8 64(src_tmp)(pos*1), Z1 + VMOVDQU8 128(src_tmp)(pos*1), Z2 + VMOVDQU8 192(src_tmp)(pos*1), Z3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU8 (src_tmp)(pos*1), Z4 + VMOVDQU8 64(src_tmp)(pos*1), Z5 + VMOVDQU8 128(src_tmp)(pos*1), Z6 + VMOVDQU8 192(src_tmp)(pos*1), Z7 + VPXORQ Z4, Z0, Z0 + VPXORQ Z5, Z1, Z1 + VPXORQ Z6, Z2, Z2 + VPXORQ Z7, Z3, Z3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU8 Z0, (dst)(pos*1) + VMOVDQU8 Z1, 64(dst)(pos*1) + VMOVDQU8 Z2, 128(dst)(pos*1) + VMOVDQU8 Z3, 192(dst)(pos*1) + + ADDQ $256, pos + CMPQ len, pos + JNE loop256b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $255, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $255, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $256 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s new file mode 100644 index 0000000..8f67edd --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s @@ -0,0 +1,72 @@ +#include "textflag.h" + +// func bytesN(dst, a, b *byte, n int) +TEXT ·bytesN(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ n+24(FP), DX + TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned. + JNZ not_aligned + +aligned: + MOVQ $0, AX // position in slices + +loop16b: + MOVOU (SI)(AX*1), X0 // XOR 16byte forwards. + MOVOU (CX)(AX*1), X1 + PXOR X1, X0 + MOVOU X0, (BX)(AX*1) + ADDQ $16, AX + CMPQ DX, AX + JNE loop16b + RET + +loop_1b: + SUBQ $1, DX // XOR 1byte backwards. + MOVB (SI)(DX*1), DI + MOVB (CX)(DX*1), AX + XORB AX, DI + MOVB DI, (BX)(DX*1) + TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b. + JNZ loop_1b + CMPQ DX, $0 // if len is 0, ret. 
+ JE ret + TESTQ $15, DX // AND 15 & len, if zero jump to aligned. + JZ aligned + +not_aligned: + TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b. + JNE loop_1b + SUBQ $8, DX // XOR 8bytes backwards. + MOVQ (SI)(DX*1), DI + MOVQ (CX)(DX*1), AX + XORQ AX, DI + MOVQ DI, (BX)(DX*1) + CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned. + JGE aligned + +ret: + RET + +// func bytes8(dst, a, b *byte) +TEXT ·bytes8(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ (SI), DI + MOVQ (CX), AX + XORQ AX, DI + MOVQ DI, (BX) + RET + +// func bytes16(dst, a, b *byte) +TEXT ·bytes16(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVOU (SI), X0 + MOVOU (CX), X1 + PXOR X1, X0 + MOVOU X0, (BX) + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s new file mode 100644 index 0000000..38df948 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s @@ -0,0 +1,123 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeSSE2(dst []byte, src [][]byte) +TEXT ·encodeSSE2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $63, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop64b: + MOVQ csrc, csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X0 + MOVOU 16(src_tmp)(pos*1), X1 + MOVOU 32(src_tmp)(pos*1), X2 + MOVOU 48(src_tmp)(pos*1), X3 + +next_vect: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X4 + MOVOU 16(src_tmp)(pos*1), X5 + MOVOU 32(src_tmp)(pos*1), X6 + MOVOU 48(src_tmp)(pos*1), X7 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + SUBQ $1, csrc_tmp + JGE next_vect + + MOVOU X0, (dst)(pos*1) + MOVOU X1, 16(dst)(pos*1) + MOVOU X2, 32(dst)(pos*1) + MOVOU X3, 48(dst)(pos*1) + + ADDQ $64, pos + CMPQ len, pos + JNE loop64b + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $63, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $63, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $64 + JGE aligned + RET + +ret: + RET |
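[Editor's note] One detail worth calling out in the assembly above: every encode* routine walks the `[][]byte` backing array in 24-byte steps (`ADDQ $24, d2src_off // len(slice) = 24`), because on amd64 each `[]byte` header is three 8-byte words (data pointer, len, cap). A small sketch confirming that layout assumption:

```go
package main

import (
	"fmt"
	"unsafe"
)

// sliceHeader mirrors the runtime layout of a []byte on 64-bit platforms:
// three 8-byte words, which is why the assembly advances 24 bytes per
// source vector.
type sliceHeader struct {
	data unsafe.Pointer
	len  int
	cap  int
}

func main() {
	src := [][]byte{make([]byte, 4), make([]byte, 4)}
	fmt.Println(unsafe.Sizeof(src[0]), unsafe.Sizeof(sliceHeader{})) // 24 24 on amd64
}
```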