summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/reedsolomon/galois_arm64.s')
-rw-r--r--vendor/github.com/klauspost/reedsolomon/galois_arm64.s125
1 files changed, 125 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
new file mode 100644
index 0000000..d2cac2c
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@@ -0,0 +1,125 @@
+//+build !noasm !appengine !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+#define LOAD(LO1, LO2, HI1, HI2) \
+ VLD1.P 32(R1), [LO1.B16, LO2.B16] \
+ \
+ \ // Get low input and high input
+ VUSHR $4, LO1.B16, HI1.B16 \
+ VUSHR $4, LO2.B16, HI2.B16 \
+ VAND V8.B16, LO1.B16, LO1.B16 \
+ VAND V8.B16, LO2.B16, LO2.B16
+
+#define GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) \
+ \ // Mul low part and mul high part
+ VTBL V0.B16, [MUL_LO.B16], OUT1.B16 \
+ VTBL V10.B16, [MUL_HI.B16], OUT2.B16 \
+ VTBL V1.B16, [MUL_LO.B16], TMP1.B16 \
+ VTBL V11.B16, [MUL_HI.B16], TMP2.B16 \
+ \
+ \ // Combine results
+ VEOR OUT2.B16, OUT1.B16, OUT1.B16 \
+ VEOR TMP2.B16, TMP1.B16, OUT2.B16
+
+// func galMulNEON(low, high, in, out []byte)
+TEXT ·galMulNEON(SB), 7, $0
+ MOVD in_base+48(FP), R1
+ MOVD in_len+56(FP), R2 // length of message
+ MOVD out_base+72(FP), R5
+ SUBS $32, R2
+ BMI complete
+
+ MOVD low+0(FP), R10 // R10: &low
+ MOVD high+24(FP), R11 // R11: &high
+ VLD1 (R10), [V6.B16]
+ VLD1 (R11), [V7.B16]
+
+ //
+ // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error
+ // WORD $0x4e010c68 // dup v8.16b, w3
+ //
+ MOVD $0x0f, R3
+ VMOV R3, V8.B[0]
+ VDUP V8.B[0], V8.B16
+
+loop:
+ // Main loop
+ LOAD(V0, V1, V10, V11)
+ GALOIS_MUL(V6, V7, V4, V5, V14, V15)
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loop
+
+complete:
+ RET
+
+// func galMulXorNEON(low, high, in, out []byte)
+TEXT ·galMulXorNEON(SB), 7, $0
+ MOVD in_base+48(FP), R1
+ MOVD in_len+56(FP), R2 // length of message
+ MOVD out_base+72(FP), R5
+ SUBS $32, R2
+ BMI completeXor
+
+ MOVD low+0(FP), R10 // R10: &low
+ MOVD high+24(FP), R11 // R11: &high
+ VLD1 (R10), [V6.B16]
+ VLD1 (R11), [V7.B16]
+
+ //
+ // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error
+ // WORD $0x4e010c68 // dup v8.16b, w3
+ //
+ MOVD $0x0f, R3
+ VMOV R3, V8.B[0]
+ VDUP V8.B[0], V8.B16
+
+loopXor:
+ // Main loop
+ VLD1 (R5), [V20.B16, V21.B16]
+
+ LOAD(V0, V1, V10, V11)
+ GALOIS_MUL(V6, V7, V4, V5, V14, V15)
+
+ VEOR V20.B16, V4.B16, V4.B16
+ VEOR V21.B16, V5.B16, V5.B16
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loopXor
+
+completeXor:
+ RET
+
+// func galXorNEON(in, out []byte)
+TEXT ·galXorNEON(SB), 7, $0
+ MOVD in_base+0(FP), R1
+ MOVD in_len+8(FP), R2 // length of message
+ MOVD out_base+24(FP), R5
+ SUBS $32, R2
+ BMI completeXor
+
+loopXor:
+ // Main loop
+ VLD1.P 32(R1), [V0.B16, V1.B16]
+ VLD1 (R5), [V20.B16, V21.B16]
+
+ VEOR V20.B16, V0.B16, V4.B16
+ VEOR V21.B16, V1.B16, V5.B16
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loopXor
+
+completeXor:
+ RET
+