summaryrefslogtreecommitdiff
path: root/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s
blob: 8f67edd23023f28f3b2e49e83836f09e28445201 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#include "textflag.h"

// func bytesN(dst, a, b *byte, n int)
//
// bytesN stores a[i] ^ b[i] into dst[i] for every i in [0, n).
// A non-positive n is a no-op.
//
// Registers:
//   BX = dst, SI = a, CX = b
//   DX = number of leading bytes still to process
//   AX = forward byte offset inside loop16b (scratch elsewhere)
//   DI = scratch, X0/X1 = 16-byte scratch
//
// Strategy: the tail is consumed backwards (single bytes until DX is a
// multiple of 8, then at most one 8-byte word) so that DX becomes a
// multiple of 16; the remaining prefix is then XORed forwards 16 bytes
// at a time with unaligned SSE2 loads/stores.
TEXT ·bytesN(SB), NOSPLIT, $0-32
	MOVQ  dst+0(FP), BX
	MOVQ  a+8(FP), SI
	MOVQ  b+16(FP), CX
	MOVQ  n+24(FP), DX
	TESTQ DX, DX             // n <= 0: nothing to do. Without this guard n == 0
	JLE   done               // would enter loop16b and run past the buffers.
	TESTQ $15, DX            // n % 16 != 0: trim the tail first.
	JNZ   not_aligned

aligned:
	XORL AX, AX              // AX = 0, forward offset; DX is a non-zero multiple of 16 here.

loop16b:
	MOVOU (SI)(AX*1), X0     // dst[AX:AX+16] = a[AX:AX+16] ^ b[AX:AX+16]
	MOVOU (CX)(AX*1), X1
	PXOR  X1, X0
	MOVOU X0, (BX)(AX*1)
	ADDQ  $16, AX
	CMPQ  DX, AX             // stop once the offset reaches the remaining length.
	JNE   loop16b
	RET

loop_1b:
	SUBQ  $1, DX             // XOR one byte, walking backwards from the end.
	MOVB  (SI)(DX*1), DI
	MOVB  (CX)(DX*1), AX
	XORB  AX, DI
	MOVB  DI, (BX)(DX*1)
	TESTQ $7, DX             // keep going until DX is a multiple of 8.
	JNZ   loop_1b
	TESTQ DX, DX             // everything consumed (n was < 8).
	JZ    done
	TESTQ $15, DX            // multiple of 16: switch to the 16-byte loop.
	JZ    aligned
	                         // otherwise DX % 16 == 8: fall through.

not_aligned:
	TESTQ $7, DX             // DX % 8 != 0: peel single bytes first.
	JNZ   loop_1b
	SUBQ  $8, DX             // DX % 16 == 8: XOR the last 8-byte word.
	MOVQ  (SI)(DX*1), DI
	MOVQ  (CX)(DX*1), AX
	XORQ  AX, DI
	MOVQ  DI, (BX)(DX*1)
	CMPQ  DX, $16            // DX is now a multiple of 16; if any is left, run loop16b.
	JGE   aligned

done:
	RET

// func bytes8(dst, a, b *byte)
//
// bytes8 XORs the 8 bytes starting at a with the 8 bytes starting at b
// and stores the 8-byte result at dst. The caller must guarantee that
// all three pointers reference at least 8 accessible bytes; no length
// is passed or checked here.
//
// Registers: BX = dst, SI = a, CX = b, DI = scratch.
TEXT ·bytes8(SB), NOSPLIT, $0-24
	MOVQ dst+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), CX
	MOVQ (SI), DI            // DI = a[0:8]
	XORQ (CX), DI            // DI ^= b[0:8]
	MOVQ DI, (BX)            // dst[0:8] = DI
	RET

// func bytes16(dst, a, b *byte)
//
// bytes16 XORs the 16 bytes starting at a with the 16 bytes starting at
// b and stores the 16-byte result at dst. The caller must guarantee
// that all three pointers reference at least 16 accessible bytes; no
// length is passed or checked here.
//
// Both inputs are loaded with MOVOU (unaligned move) before PXOR, so
// the pointers do not need to be 16-byte aligned.
//
// Registers: BX = dst, SI = a, CX = b, X0/X1 = scratch.
TEXT ·bytes16(SB), NOSPLIT, $0-24
	MOVQ  dst+0(FP), BX
	MOVQ  a+8(FP), SI
	MOVQ  b+16(FP), CX
	MOVOU (SI), X0           // X0 = a[0:16]
	MOVOU (CX), X1           // X1 = b[0:16]
	PXOR  X1, X0             // X0 ^= X1
	MOVOU X0, (BX)           // dst[0:16] = X0
	RET