diff options
author | kali kaneko (leap communications) <kali@leap.se> | 2021-11-29 01:46:27 +0100 |
---|---|---|
committer | kali kaneko (leap communications) <kali@leap.se> | 2021-11-29 18:14:16 +0100 |
commit | 18f52af5be3a9a0c73811706108f790d65ee9c67 (patch) | |
tree | e13cbacb47d56919caa9c44a2b45dec1497a7860 /vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s | |
parent | ebcef0d57b6ecb5a40c6579f6be07182dd3033ba (diff) |
[pkg] update vendor
Diffstat (limited to 'vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s')
-rw-r--r-- | vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s | 124 |
1 files changed, 124 insertions, 0 deletions
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

#include "textflag.h"

#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect

#define csrc_tmp R9 // countdown over the remaining src vectors
#define d2src_off R10 // byte offset of the current slice header inside src
#define src_tmp R11 // data pointer of the current src vector
#define not_aligned_len R12 // bytes still to be handled by the 8-byte loop (len mod 128)
#define src_val0 R13 // scalar XOR accumulator
#define src_val1 R14 // scalar value loaded from the current src vector

// func encodeAVX2(dst []byte, src [][]byte)
//
// XORs src[0], src[1], ..., src[len(src)-1] together byte-wise and stores the
// result in dst. The number of bytes processed is len(dst).
//
// Processing order:
//   1. If len is not a multiple of 8, the trailing bytes are handled one at a
//      time (from the end of the vectors) until len is a multiple of 8.
//   2. If len is then not a multiple of 128, the trailing (len mod 128) bytes
//      are handled 8 at a time, again from the end.
//   3. The remaining multiple-of-128 prefix is handled from the start with
//      four 32-byte YMM registers per iteration.
//
// Preconditions (not checked here; NOTE(review): presumably enforced by the Go
// caller - confirm):
//   - len(src) >= 2: every loop loads src[0] and then unconditionally loads
//     at least one more vector.
//   - every src[i] holds at least len(dst) readable bytes.
//
// Arguments are two slice headers (24 bytes each), hence the 48-byte argument
// size in the frame specification; there are no results and no local frame.
TEXT ·encodeAVX2(SB), NOSPLIT, $0-48
	MOVQ  dst_base+0(FP), dst
	MOVQ  src_base+24(FP), d2src
	MOVQ  src_len+32(FP), csrc
	MOVQ  dst_len+8(FP), len

	// Nothing to do for an empty dst. Without this check a zero length
	// passes the 128-byte alignment test below and the vector loop would
	// read and write out of bounds.
	TESTQ len, len
	JZ    ret

	TESTQ $127, len
	JNZ   not_aligned

aligned:
	MOVQ $0, pos

	// One iteration XORs 128 bytes at offset pos across all src vectors.
loop128b:
	MOVQ    csrc, csrc_tmp                  // store src_cnt -> csrc_tmp
	SUBQ    $2, csrc_tmp                    // src[0] is loaded here; next_vect runs csrc-1 times
	MOVQ    $0, d2src_off
	MOVQ    (d2src)(d2src_off*1), src_tmp   // get first src_vect's addr -> src_tmp
	VMOVDQU (src_tmp)(pos*1), Y0
	VMOVDQU 32(src_tmp)(pos*1), Y1
	VMOVDQU 64(src_tmp)(pos*1), Y2
	VMOVDQU 96(src_tmp)(pos*1), Y3

next_vect:
	ADDQ    $24, d2src_off                  // len(slice) = 24
	MOVQ    (d2src)(d2src_off*1), src_tmp   // next data_vect
	VMOVDQU (src_tmp)(pos*1), Y4
	VMOVDQU 32(src_tmp)(pos*1), Y5
	VMOVDQU 64(src_tmp)(pos*1), Y6
	VMOVDQU 96(src_tmp)(pos*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, csrc_tmp
	JGE     next_vect

	VMOVDQU Y0, (dst)(pos*1)
	VMOVDQU Y1, 32(dst)(pos*1)
	VMOVDQU Y2, 64(dst)(pos*1)
	VMOVDQU Y3, 96(dst)(pos*1)

	ADDQ $128, pos
	CMPQ len, pos
	JNE  loop128b
	VZEROUPPER                              // YMM registers were used: clear their upper halves before returning
	RET

	// One iteration XORs the single byte at offset len-1 across all src
	// vectors, then shrinks len by one.
loop_1b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVB -1(src_tmp)(len*1), src_val0       // encode from the end of src

next_vect_1b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVB -1(src_tmp)(len*1), src_val1
	XORB src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_1b

	MOVB  src_val0, -1(dst)(len*1)
	SUBQ  $1, len
	TESTQ $7, len                           // keep going byte-wise until len is a multiple of 8
	JNZ   loop_1b

	CMPQ  len, $0
	JE    ret
	TESTQ $127, len
	JZ    aligned                           // remainder is a multiple of 128: go straight to the vector loop

not_aligned:
	TESTQ $7, len
	JNE   loop_1b
	MOVQ  len, not_aligned_len
	ANDQ  $127, not_aligned_len             // bytes beyond the last 128-byte boundary

	// One iteration XORs the 8 bytes ending at offset len across all src
	// vectors, then shrinks len by eight.
loop_8b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVQ -8(src_tmp)(len*1), src_val0

next_vect_8b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVQ -8(src_tmp)(len*1), src_val1
	XORQ src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_8b

	MOVQ src_val0, -8(dst)(len*1)
	SUBQ $8, len
	SUBQ $8, not_aligned_len
	JG   loop_8b

	// len is now a multiple of 128; anything left is handled by the
	// vector loop. No YMM register has been touched on this path, so a
	// plain RET is sufficient when nothing is left.
	CMPQ len, $128
	JGE  aligned
	RET

ret:
	RET