// +build amd64,!appengine,!gccgo

// This is a translation of the gcc output of FloodyBerry's pure-C public
// domain siphash implementation at https://github.com/floodyberry/siphash

// This assembly code has been modified from the 64-bit output to the
// experimental 128-bit output.

// Register roles from afterSwitch onward (tail word and finalization):
// SI = v0
// AX = v1
// CX = v2
// DX = v3
//
// NOTE(review): before afterSwitch (setup and loopBody) the four state words
// live in different registers. Matching how the message word is mixed in
// (XORed into one word before the rounds and into another after them), the
// correspondence appears to be R9 = v0, AX = v1, BX = v2, CX = v3. The two
// LEAQ/MOVQ instructions at afterSwitch perform the hand-over to SI/CX/DX.
//
// Other registers:
// SI = b_base until afterSwitch (then reused for state)
// DI = k1 during setup, then the byte offset into b
// DX = len(b) during setup, then the tail byte count / scratch
// R8 = current 8-byte message word
// R10 = len(b) rounded down to a multiple of 8
// R11 = final word: len(b)<<56 ORed with the 0..7 trailing bytes
// BX = scratch for folding each 64-bit result (after afterSwitch)
//
// All rotations are written as right-rotates. As left-rotates:
// RORQ $0x33 = rotl 13, $0x30 = rotl 16, $0x2F = rotl 17,
// RORQ $0x2B = rotl 21, $0x20 = rotl 32.

// func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
//
// Frame layout ($0-56: no locals, 56 bytes of arguments and results):
// k0 at 0(FP), k1 at 8(FP), b_base at 16(FP), b_len at 24(FP),
// first result at 40(FP) (addressed as ret), second at 48(FP) (ret1).
// The TEXT flag 4 is NOSPLIT in Go's textflag.h.
TEXT ·Hash128(SB),4,$0-56
	// Setup: load the key halves and XOR them into the four state constants.
	// The constants are the ASCII strings "somepseu", "lygenera", "dorandom"
	// and "tedbytes"; the "dorandom" word is additionally XORed with 0xEE.
	MOVQ k0+0(FP),CX
	MOVQ $0x736F6D6570736575,R9
	MOVQ k1+8(FP),DI
	MOVQ $0x6C7967656E657261,BX
	MOVQ $0x646F72616E646F6D,AX
	MOVQ b_len+24(FP),DX
	XORQ $0xEE,AX
	MOVQ DX,R11
	MOVQ DX,R10
	XORQ CX,R9 // R9 = "somepseu" ^ k0
	XORQ CX,BX // BX = "lygenera" ^ k0
	MOVQ $0x7465646279746573,CX
	XORQ DI,AX // AX = "dorandom" ^ 0xEE ^ k1
	XORQ DI,CX // CX = "tedbytes" ^ k1
	SHLQ $0x38,R11 // R11 = len(b) << 56 (only the low 8 bits of len survive)
	XORQ DI,DI // DI = 0: byte offset into b
	MOVQ b_base+16(FP),SI
	ANDQ $0xFFFFFFFFFFFFFFF8,R10 // R10 = len(b) &^ 7; sets ZF when len(b) < 8
	JE afterLoop // no full 8-byte word to process
	XCHGQ AX,AX // exchanges AX with itself; presumably padding kept from the gcc output
loopBody:
	// Process one 8-byte word: XOR it into CX, run two mixing rounds
	// (interleaved below), then XOR it into R9.
	MOVQ 0(SI)(DI*1),R8 // R8 = 8 bytes of b at offset DI
	ADDQ AX,R9
	RORQ $0x33,AX
	XORQ R9,AX
	RORQ $0x20,R9
	ADDQ $0x8,DI // advance the offset early; R8 already holds the word
	XORQ R8,CX // mix the word in before the rounds
	ADDQ CX,BX
	RORQ $0x30,CX
	XORQ BX,CX
	ADDQ AX,BX
	RORQ $0x2F,AX
	ADDQ CX,R9
	RORQ $0x2B,CX
	XORQ BX,AX
	XORQ R9,CX
	RORQ $0x20,BX
	// second round
	ADDQ AX,R9
	ADDQ CX,BX
	RORQ $0x33,AX
	RORQ $0x30,CX
	XORQ R9,AX
	XORQ BX,CX
	RORQ $0x20,R9
	ADDQ AX,BX
	ADDQ CX,R9
	RORQ $0x2F,AX
	RORQ $0x2B,CX
	XORQ BX,AX
	RORQ $0x20,BX
	XORQ R9,CX
	XORQ R8,R9 // mix the word in after the rounds
	CMPQ R10,DI
	JA loopBody // continue while R10 > DI (unsigned)
afterLoop:
	// DX = number of trailing bytes (len(b) minus the rounded-down length).
	SUBQ R10,DX

	// NOTE(review): DX is already in 0..7 here, so this branch looks like it
	// can never be taken; it is kept as in the translated gcc output.
	CMPQ DX,$0x7
	JA afterSwitch

	// no support for jump tables

	CMPQ DX,$0x7
	JE sw7
	CMPQ DX,$0x6
	JE sw6
	CMPQ DX,$0x5
	JE sw5
	CMPQ DX,$0x4
	JE sw4
	CMPQ DX,$0x3
	JE sw3
	CMPQ DX,$0x2
	JE sw2
	CMPQ DX,$0x1
	JE sw1
	JMP afterSwitch // zero trailing bytes: R11 holds only the shifted length

	// Each swN case ORs trailing byte N-1 into R11 at bit position 8*(N-1)
	// and falls through to the next lower case, so entering at swN collects
	// bytes N-1 down to 0. DX is reused as scratch for the loaded byte.
sw7:	MOVBQZX 6(SI)(DI*1),DX
	SHLQ $0x30,DX
	ORQ DX,R11
sw6:	MOVBQZX 0x5(SI)(DI*1),DX
	SHLQ $0x28,DX
	ORQ DX,R11
sw5:	MOVBQZX 0x4(SI)(DI*1),DX
	SHLQ $0x20,DX
	ORQ DX,R11
sw4:	MOVBQZX 0x3(SI)(DI*1),DX
	SHLQ $0x18,DX
	ORQ DX,R11
sw3:	MOVBQZX 0x2(SI)(DI*1),DX
	SHLQ $0x10,DX
	ORQ DX,R11
sw2:	MOVBQZX 0x1(SI)(DI*1),DX
	SHLQ $0x8,DX
	ORQ DX,R11
sw1:	MOVBQZX 0(SI)(DI*1),DX
	ORQ DX,R11
afterSwitch:
	// Process the final word in R11 with two rounds, moving the state into
	// the SI/AX/CX/DX registers listed in the file header as it goes.
	LEAQ (AX)(R9*1),SI // SI = R9 + AX: first add of the round, result lands in SI
	XORQ R11,CX // mix the final word in before the rounds
	RORQ $0x33,AX
	ADDQ CX,BX
	MOVQ CX,DX // the word held in CX continues in DX
	XORQ SI,AX
	RORQ $0x30,DX
	RORQ $0x20,SI
	LEAQ 0(BX)(AX*1),CX // CX = BX + AX: the word held in BX continues in CX
	XORQ BX,DX
	RORQ $0x2F,AX
	ADDQ DX,SI
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX
	// second round
	ADDQ AX,SI
	RORQ $0x33,AX
	ADDQ DX,CX
	XORQ SI,AX
	RORQ $0x30,DX
	RORQ $0x20,SI
	XORQ CX,DX
	ADDQ AX,CX
	RORQ $0x2F,AX
	ADDQ DX,SI
	XORQ CX,AX
	RORQ $0x2B,DX
	RORQ $0x20,CX
	XORQ SI,DX
	XORQ R11,SI // mix the final word in after the rounds

	// First finalization: XOR 0xEE into the low byte of CX (v2), then run
	// four rounds.
	XORB $0xEE,CL
	// round 1 of 4
	ADDQ AX,SI
	RORQ $0x33,AX
	ADDQ DX,CX
	RORQ $0x30,DX
	XORQ SI,AX
	XORQ CX,DX
	RORQ $0x20,SI
	ADDQ AX,CX
	ADDQ DX,SI
	RORQ $0x2F,AX
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX
	// round 2 of 4
	ADDQ AX,SI
	ADDQ DX,CX
	RORQ $0x33,AX
	RORQ $0x30,DX
	XORQ SI,AX
	RORQ $0x20,SI
	XORQ CX,DX
	ADDQ AX,CX
	RORQ $0x2F,AX
	ADDQ DX,SI
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX
	// round 3 of 4
	ADDQ AX,SI
	ADDQ DX,CX
	RORQ $0x33,AX
	RORQ $0x30,DX
	XORQ CX,DX
	XORQ SI,AX
	RORQ $0x20,SI
	ADDQ DX,SI
	ADDQ AX,CX
	RORQ $0x2F,AX
	XORQ CX,AX
	RORQ $0x2B,DX
	RORQ $0x20,CX
	XORQ SI,DX

	// gcc optimized the tail end of this function differently. However,
	// we need to preserve our registers to carry out the second stage of
	// the finalization. This is a duplicate of an earlier finalization
	// round.
	// round 4 of 4
	ADDQ AX,SI
	RORQ $0x33,AX
	ADDQ DX,CX
	RORQ $0x30,DX
	XORQ SI,AX
	XORQ CX,DX
	RORQ $0x20,SI
	ADDQ AX,CX
	ADDQ DX,SI
	RORQ $0x2F,AX
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX

	// Stuff the result into BX instead of AX as gcc had done, so that the
	// state in SI/AX/CX/DX stays intact for the second finalization.
	// First result = SI ^ AX ^ DX ^ CX.
	MOVQ SI,BX
	XORQ AX,BX
	XORQ DX,BX
	XORQ CX,BX
	MOVQ BX,ret+40(FP)

	// Start the second finalization round: XOR 0xDD into the low byte of
	// AX (v1), then run four more rounds on the preserved state.

	XORB $0xDD,AL
	// round 1 of 4
	ADDQ AX,SI
	RORQ $0x33,AX
	ADDQ DX,CX
	RORQ $0x30,DX
	XORQ SI,AX
	XORQ CX,DX
	RORQ $0x20,SI
	ADDQ AX,CX
	ADDQ DX,SI
	RORQ $0x2F,AX
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX
	// round 2 of 4
	ADDQ AX,SI
	ADDQ DX,CX
	RORQ $0x33,AX
	RORQ $0x30,DX
	XORQ SI,AX
	RORQ $0x20,SI
	XORQ CX,DX
	ADDQ AX,CX
	RORQ $0x2F,AX
	ADDQ DX,SI
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX
	// round 3 of 4
	ADDQ AX,SI
	ADDQ DX,CX
	RORQ $0x33,AX
	RORQ $0x30,DX
	XORQ CX,DX
	XORQ SI,AX
	RORQ $0x20,SI
	ADDQ DX,SI
	ADDQ AX,CX
	RORQ $0x2F,AX
	XORQ CX,AX
	RORQ $0x2B,DX
	RORQ $0x20,CX
	XORQ SI,DX
	// round 4 of 4
	ADDQ AX,SI
	RORQ $0x33,AX
	ADDQ DX,CX
	RORQ $0x30,DX
	XORQ SI,AX
	XORQ CX,DX
	RORQ $0x20,SI
	ADDQ AX,CX
	ADDQ DX,SI
	RORQ $0x2F,AX
	RORQ $0x2B,DX
	XORQ CX,AX
	XORQ SI,DX
	RORQ $0x20,CX

	// Second result = SI ^ AX ^ DX ^ CX.
	MOVQ SI,BX
	XORQ AX,BX
	XORQ DX,BX
	XORQ CX,BX
	MOVQ BX,ret1+48(FP)
	RET