Diffstat (limited to 'vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s')
-rw-r--r--  vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s  72
1 file changed, 72 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s
new file mode 100644
index 0000000..8f67edd
--- /dev/null
+++ b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s
@@ -0,0 +1,72 @@
+#include "textflag.h"
+
+// func bytesN(dst, a, b *byte, n int)
+TEXT ·bytesN(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVQ n+24(FP), DX
+ TESTQ $15, DX // len & 15; if not zero, jump to not_aligned.
+ JNZ not_aligned
+
+aligned:
+ MOVQ $0, AX // position in slices
+
+loop16b:
+ MOVOU (SI)(AX*1), X0 // XOR 16 bytes forwards.
+ MOVOU (CX)(AX*1), X1
+ PXOR X1, X0
+ MOVOU X0, (BX)(AX*1)
+ ADDQ $16, AX
+ CMPQ DX, AX
+ JNE loop16b
+ RET
+
+loop_1b:
+ SUBQ $1, DX // XOR 1 byte backwards.
+ MOVB (SI)(DX*1), DI
+ MOVB (CX)(DX*1), AX
+ XORB AX, DI
+ MOVB DI, (BX)(DX*1)
+ TESTQ $7, DX // len & 7; if not zero, jump to loop_1b.
+ JNZ loop_1b
+ CMPQ DX, $0 // if len is 0, return.
+ JE ret
+ TESTQ $15, DX // len & 15; if zero, jump to aligned.
+ JZ aligned
+
+not_aligned:
+ TESTQ $7, DX // len & 7; if not zero, jump to loop_1b.
+ JNE loop_1b
+ SUBQ $8, DX // XOR 8 bytes backwards.
+ MOVQ (SI)(DX*1), DI
+ MOVQ (CX)(DX*1), AX
+ XORQ AX, DI
+ MOVQ DI, (BX)(DX*1)
+ CMPQ DX, $16 // if len is greater than or equal to 16 here, it must be a multiple of 16.
+ JGE aligned
+
+ret:
+ RET
+
+// func bytes8(dst, a, b *byte)
+TEXT ·bytes8(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVQ (SI), DI
+ MOVQ (CX), AX
+ XORQ AX, DI
+ MOVQ DI, (BX)
+ RET
+
+// func bytes16(dst, a, b *byte)
+TEXT ·bytes16(SB), NOSPLIT, $0
+ MOVQ d+0(FP), BX
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVOU (SI), X0
+ MOVOU (CX), X1
+ PXOR X1, X0
+ MOVOU X0, (BX)
+ RET
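
The jump-heavy tail handling in bytesN above is easier to follow in straight-line form. Below is a pure-Go sketch of the same branch structure; xorBytesN is a hypothetical name used only for illustration, a plain byte loop stands in for the MOVOU/PXOR 16-byte block, and an explicit n != 0 guard replaces the assembly's assumption that its caller never passes a zero length.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// xorBytesN mirrors the control flow of the assembly bytesN:
// peel single bytes off the tail until the remaining length is a
// multiple of 8, peel one 8-byte word if the length is not yet a
// multiple of 16, then XOR the remaining multiple-of-16 prefix.
func xorBytesN(dst, a, b []byte, n int) {
	for n != 0 {
		switch {
		case n&15 == 0:
			// "aligned"/"loop16b": length is a multiple of 16; XOR the
			// whole remaining prefix forwards (the asm does this in
			// 16-byte MOVOU/PXOR blocks; a byte loop stands in here).
			for i := 0; i < n; i++ {
				dst[i] = a[i] ^ b[i]
			}
			return
		case n&7 != 0:
			// "loop_1b": XOR 1 byte backwards until the remaining
			// length is a multiple of 8.
			n--
			dst[n] = a[n] ^ b[n]
		default:
			// "not_aligned": length is a multiple of 8 but not of 16;
			// XOR one 8-byte word backwards (XORQ in the asm).
			n -= 8
			x := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
			binary.LittleEndian.PutUint64(dst[n:], x)
		}
	}
}

func main() {
	const n = 27 // exercises all three paths: 3 single bytes, one 8-byte word, one 16-byte pass
	a := bytes.Repeat([]byte{0xAA}, n)
	b := bytes.Repeat([]byte{0x0F}, n)
	dst := make([]byte, n)
	xorBytesN(dst, a, b, n)
	fmt.Printf("% x\n", dst) // every byte is 0xaa ^ 0x0f = 0xa5
}

bytes8 and bytes16 are the loop-free specializations of the same idea: a single 8-byte XORQ and a single 16-byte MOVOU/PXOR, respectively.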