summaryrefslogtreecommitdiff
path: root/vendor/github.com/dchest/siphash/hash128_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/dchest/siphash/hash128_amd64.s')
-rw-r--r--vendor/github.com/dchest/siphash/hash128_amd64.s292
1 files changed, 292 insertions, 0 deletions
diff --git a/vendor/github.com/dchest/siphash/hash128_amd64.s b/vendor/github.com/dchest/siphash/hash128_amd64.s
new file mode 100644
index 0000000..86605cc
--- /dev/null
+++ b/vendor/github.com/dchest/siphash/hash128_amd64.s
@@ -0,0 +1,292 @@
+// +build amd64,!appengine,!gccgo
+
+// This is a translation of the gcc output of FloodyBerry's pure-C public
+// domain siphash implementation at https://github.com/floodyberry/siphash
+
+// This assembly code has been modified from the 64-bit output to the experiment 128-bit output.
+
+// SI = v0
+// AX = v1
+// CX = v2
+// DX = v3
+
+// func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
+TEXT ·Hash128(SB),4,$0-56
+ MOVQ k0+0(FP),CX
+ MOVQ $0x736F6D6570736575,R9
+ MOVQ k1+8(FP),DI
+ MOVQ $0x6C7967656E657261,BX
+ MOVQ $0x646F72616E646F6D,AX
+ MOVQ b_len+24(FP),DX
+ XORQ $0xEE,AX
+ MOVQ DX,R11
+ MOVQ DX,R10
+ XORQ CX,R9
+ XORQ CX,BX
+ MOVQ $0x7465646279746573,CX
+ XORQ DI,AX
+ XORQ DI,CX
+ SHLQ $0x38,R11
+ XORQ DI,DI
+ MOVQ b_base+16(FP),SI
+ ANDQ $0xFFFFFFFFFFFFFFF8,R10
+ JE afterLoop
+ XCHGQ AX,AX
+loopBody:
+ MOVQ 0(SI)(DI*1),R8
+ ADDQ AX,R9
+ RORQ $0x33,AX
+ XORQ R9,AX
+ RORQ $0x20,R9
+ ADDQ $0x8,DI
+ XORQ R8,CX
+ ADDQ CX,BX
+ RORQ $0x30,CX
+ XORQ BX,CX
+ ADDQ AX,BX
+ RORQ $0x2F,AX
+ ADDQ CX,R9
+ RORQ $0x2B,CX
+ XORQ BX,AX
+ XORQ R9,CX
+ RORQ $0x20,BX
+ ADDQ AX,R9
+ ADDQ CX,BX
+ RORQ $0x33,AX
+ RORQ $0x30,CX
+ XORQ R9,AX
+ XORQ BX,CX
+ RORQ $0x20,R9
+ ADDQ AX,BX
+ ADDQ CX,R9
+ RORQ $0x2F,AX
+ RORQ $0x2B,CX
+ XORQ BX,AX
+ RORQ $0x20,BX
+ XORQ R9,CX
+ XORQ R8,R9
+ CMPQ R10,DI
+ JA loopBody
+afterLoop:
+ SUBQ R10,DX
+
+ CMPQ DX,$0x7
+ JA afterSwitch
+
+ // no support for jump tables
+
+ CMPQ DX,$0x7
+ JE sw7
+
+ CMPQ DX,$0x6
+ JE sw6
+
+ CMPQ DX,$0x5
+ JE sw5
+
+ CMPQ DX,$0x4
+ JE sw4
+
+ CMPQ DX,$0x3
+ JE sw3
+
+ CMPQ DX,$0x2
+ JE sw2
+
+ CMPQ DX,$0x1
+ JE sw1
+
+ JMP afterSwitch
+
+sw7: MOVBQZX 6(SI)(DI*1),DX
+ SHLQ $0x30,DX
+ ORQ DX,R11
+sw6: MOVBQZX 0x5(SI)(DI*1),DX
+ SHLQ $0x28,DX
+ ORQ DX,R11
+sw5: MOVBQZX 0x4(SI)(DI*1),DX
+ SHLQ $0x20,DX
+ ORQ DX,R11
+sw4: MOVBQZX 0x3(SI)(DI*1),DX
+ SHLQ $0x18,DX
+ ORQ DX,R11
+sw3: MOVBQZX 0x2(SI)(DI*1),DX
+ SHLQ $0x10,DX
+ ORQ DX,R11
+sw2: MOVBQZX 0x1(SI)(DI*1),DX
+ SHLQ $0x8,DX
+ ORQ DX,R11
+sw1: MOVBQZX 0(SI)(DI*1),DX
+ ORQ DX,R11
+afterSwitch:
+ LEAQ (AX)(R9*1),SI
+ XORQ R11,CX
+ RORQ $0x33,AX
+ ADDQ CX,BX
+ MOVQ CX,DX
+ XORQ SI,AX
+ RORQ $0x30,DX
+ RORQ $0x20,SI
+ LEAQ 0(BX)(AX*1),CX
+ XORQ BX,DX
+ RORQ $0x2F,AX
+ ADDQ DX,SI
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+ ADDQ AX,SI
+ RORQ $0x33,AX
+ ADDQ DX,CX
+ XORQ SI,AX
+ RORQ $0x30,DX
+ RORQ $0x20,SI
+ XORQ CX,DX
+ ADDQ AX,CX
+ RORQ $0x2F,AX
+ ADDQ DX,SI
+ XORQ CX,AX
+ RORQ $0x2B,DX
+ RORQ $0x20,CX
+ XORQ SI,DX
+ XORQ R11,SI
+ XORB $0xEE,CL
+ ADDQ AX,SI
+ RORQ $0x33,AX
+ ADDQ DX,CX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ XORQ CX,DX
+ RORQ $0x20,SI
+ ADDQ AX,CX
+ ADDQ DX,SI
+ RORQ $0x2F,AX
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+ ADDQ AX,SI
+ ADDQ DX,CX
+ RORQ $0x33,AX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ RORQ $0x20,SI
+ XORQ CX,DX
+ ADDQ AX,CX
+ RORQ $0x2F,AX
+ ADDQ DX,SI
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+ ADDQ AX,SI
+ ADDQ DX,CX
+ RORQ $0x33,AX
+ RORQ $0x30,DX
+ XORQ CX,DX
+ XORQ SI,AX
+ RORQ $0x20,SI
+ ADDQ DX,SI
+ ADDQ AX,CX
+ RORQ $0x2F,AX
+ XORQ CX,AX
+ RORQ $0x2B,DX
+ RORQ $0x20,CX
+ XORQ SI,DX
+
+ // gcc optimized the tail end of this function differently. However,
+ // we need to preserve out registers to carry out the second stage of
+ // the finalization. This is a duplicate of an earlier finalization
+ // round.
+
+ ADDQ AX,SI
+ RORQ $0x33,AX
+ ADDQ DX,CX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ XORQ CX,DX
+ RORQ $0x20,SI
+ ADDQ AX,CX
+ ADDQ DX,SI
+ RORQ $0x2F,AX
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+
+ // Stuff the result into BX instead of AX as gcc had done
+
+ MOVQ SI,BX
+ XORQ AX,BX
+ XORQ DX,BX
+ XORQ CX,BX
+ MOVQ BX,ret+40(FP)
+
+ // Start the second finalization round
+
+ XORB $0xDD,AL
+ ADDQ AX,SI
+ RORQ $0x33,AX
+ ADDQ DX,CX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ XORQ CX,DX
+ RORQ $0x20,SI
+ ADDQ AX,CX
+ ADDQ DX,SI
+ RORQ $0x2F,AX
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+ ADDQ AX,SI
+ ADDQ DX,CX
+ RORQ $0x33,AX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ RORQ $0x20,SI
+ XORQ CX,DX
+ ADDQ AX,CX
+ RORQ $0x2F,AX
+ ADDQ DX,SI
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+ ADDQ AX,SI
+ ADDQ DX,CX
+ RORQ $0x33,AX
+ RORQ $0x30,DX
+ XORQ CX,DX
+ XORQ SI,AX
+ RORQ $0x20,SI
+ ADDQ DX,SI
+ ADDQ AX,CX
+ RORQ $0x2F,AX
+ XORQ CX,AX
+ RORQ $0x2B,DX
+ RORQ $0x20,CX
+ XORQ SI,DX
+
+ ADDQ AX,SI
+ RORQ $0x33,AX
+ ADDQ DX,CX
+ RORQ $0x30,DX
+ XORQ SI,AX
+ XORQ CX,DX
+ RORQ $0x20,SI
+ ADDQ AX,CX
+ ADDQ DX,SI
+ RORQ $0x2F,AX
+ RORQ $0x2B,DX
+ XORQ CX,AX
+ XORQ SI,DX
+ RORQ $0x20,CX
+
+ MOVQ SI,BX
+ XORQ AX,BX
+ XORQ DX,BX
+ XORQ CX,BX
+ MOVQ BX,ret1+48(FP)
+
+ RET