summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go')
-rw-r--r--vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go338
1 files changed, 338 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go
new file mode 100644
index 0000000..720196f
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go
@@ -0,0 +1,338 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2019, Minio, Inc.
+
+package reedsolomon
+
+import (
+ "sync"
+)
+
+//go:noescape
+func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
+
+//go:noescape
+func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
+
+//go:noescape
+func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
+
+const (
+ dimIn = 8 // Number of input rows processed simultaneously
+ dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine
+ dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
+ dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
+ matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine
+ matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
+ matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
+)
+
+// Construct block of matrix coefficients for single output row in parallel
+func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
+ offset := 0
+ for c := inputOffset; c < inputOffset+dimIn; c++ {
+ for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ {
+ if c < len(matrixRows[iRow]) {
+ coeff := matrixRows[iRow][c]
+ copy(matrix[offset*32:], mulTableLow[coeff][:])
+ copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
+ } else {
+ // coefficients not used for this input shard (so null out)
+ v := matrix[offset*32 : offset*32+32]
+ for i := range v {
+ v[i] = 0
+ }
+ }
+ offset += dimIn
+ if offset >= dimIn*dimOut81 {
+ offset -= dimIn*dimOut81 - 1
+ }
+ }
+ }
+}
+
+// Construct block of matrix coefficients for 2 output rows in parallel
+func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
+ offset := 0
+ for c := inputOffset; c < inputOffset+dimIn; c++ {
+ for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
+ if c < len(matrixRows[iRow]) {
+ coeff := matrixRows[iRow][c]
+ copy(matrix[offset*32:], mulTableLow[coeff][:])
+ copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
+ } else {
+ // coefficients not used for this input shard (so null out)
+ v := matrix[offset*32 : offset*32+32]
+ for i := range v {
+ v[i] = 0
+ }
+ }
+ offset += dimIn
+ if offset >= dimIn*dimOut82 {
+ offset -= dimIn*dimOut82 - 1
+ }
+ }
+ }
+}
+
+// Construct block of matrix coefficients for 4 output rows in parallel
+func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
+ offset := 0
+ for c := inputOffset; c < inputOffset+dimIn; c++ {
+ for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
+ if c < len(matrixRows[iRow]) {
+ coeff := matrixRows[iRow][c]
+ copy(matrix[offset*32:], mulTableLow[coeff][:])
+ copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
+ } else {
+ // coefficients not used for this input shard (so null out)
+ v := matrix[offset*32 : offset*32+32]
+ for i := range v {
+ v[i] = 0
+ }
+ }
+ offset += dimIn
+ if offset >= dimIn*dimOut84 {
+ offset -= dimIn*dimOut84 - 1
+ }
+ }
+ }
+}
+
+// Invoke AVX512 routine for single output row in parallel
+func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
+ done := stop - start
+ if done <= 0 {
+ return
+ }
+
+ inputEnd := inputOffset + dimIn
+ if inputEnd > len(in) {
+ inputEnd = len(in)
+ }
+ outputEnd := outputOffset + dimOut81
+ if outputEnd > len(out) {
+ outputEnd = len(out)
+ }
+
+ // We know the max size, alloc temp array.
+ var inTmp [dimIn][]byte
+ for i, v := range in[inputOffset:inputEnd] {
+ inTmp[i] = v[start:stop]
+ }
+ var outTmp [dimOut81][]byte
+ for i, v := range out[outputOffset:outputEnd] {
+ outTmp[i] = v[start:stop]
+ }
+
+ addTo := inputOffset != 0 // Except for the first input column, add to previous results
+ _galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo)
+
+ done = start + ((done >> 6) << 6)
+ if done < stop {
+ galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
+ }
+}
+
+// Invoke AVX512 routine for 2 output rows in parallel
+func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
+ done := stop - start
+ if done <= 0 {
+ return
+ }
+
+ inputEnd := inputOffset + dimIn
+ if inputEnd > len(in) {
+ inputEnd = len(in)
+ }
+ outputEnd := outputOffset + dimOut82
+ if outputEnd > len(out) {
+ outputEnd = len(out)
+ }
+
+ // We know the max size, alloc temp array.
+ var inTmp [dimIn][]byte
+ for i, v := range in[inputOffset:inputEnd] {
+ inTmp[i] = v[start:stop]
+ }
+ var outTmp [dimOut82][]byte
+ for i, v := range out[outputOffset:outputEnd] {
+ outTmp[i] = v[start:stop]
+ }
+
+ addTo := inputOffset != 0 // Except for the first input column, add to previous results
+ _galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
+
+ done = start + ((done >> 6) << 6)
+ if done < stop {
+ galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
+ }
+}
+
+// Invoke AVX512 routine for 4 output rows in parallel
+func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) {
+ done := stop - start
+ if done <= 0 {
+ return
+ }
+
+ inputEnd := inputOffset + dimIn
+ if inputEnd > len(in) {
+ inputEnd = len(in)
+ }
+ outputEnd := outputOffset + dimOut84
+ if outputEnd > len(out) {
+ outputEnd = len(out)
+ }
+
+ // We know the max size, alloc temp array.
+ var inTmp [dimIn][]byte
+ for i, v := range in[inputOffset:inputEnd] {
+ inTmp[i] = v[start:stop]
+ }
+ var outTmp [dimOut84][]byte
+ for i, v := range out[outputOffset:outputEnd] {
+ outTmp[i] = v[start:stop]
+ }
+
+ addTo := inputOffset != 0 // Except for the first input column, add to previous results
+ _galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
+
+ done = start + ((done >> 6) << 6)
+ if done < stop {
+ galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
+ }
+}
+
+func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
+ for c := inputOffset; c < inputEnd; c++ {
+ for iRow := outputOffset; iRow < outputEnd; iRow++ {
+ if c < len(matrixRows[iRow]) {
+ mt := mulTable[matrixRows[iRow][c]][:256]
+ for i := done; i < stop; i++ {
+ if c == 0 { // only set value for first input column
+ out[iRow][i] = mt[in[c][i]]
+ } else { // and add for all others
+ out[iRow][i] ^= mt[in[c][i]]
+ }
+ }
+ }
+ }
+ }
+}
+
+// Perform the same as codeSomeShards, but taking advantage of
+// AVX512 parallelism for up to 4x faster execution as compared to AVX2
+func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+ // Process using no goroutines
+ start, end := 0, r.o.perRound
+ if end > byteCount {
+ end = byteCount
+ }
+ for start < byteCount {
+ matrix84 := [matrixSize84]byte{}
+ matrix82 := [matrixSize82]byte{}
+ matrix81 := [matrixSize81]byte{}
+
+ outputRow := 0
+ // First process (multiple) batches of 4 output rows in parallel
+ if outputRow+dimOut84 <= outputCount {
+ for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 {
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix84(matrixRows, inputRow, outputRow, &matrix84)
+ galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix84)
+ }
+ }
+ }
+ // Then process a (single) batch of 2 output rows in parallel
+ if outputRow+dimOut82 <= outputCount {
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix82(matrixRows, inputRow, outputRow, &matrix82)
+ galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix82)
+ }
+ outputRow += dimOut82
+ }
+ // Lastly, we may have a single output row left (for uneven parity)
+ if outputRow < outputCount {
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
+ galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
+ }
+ }
+
+ start = end
+ end += r.o.perRound
+ if end > byteCount {
+ end = byteCount
+ }
+ }
+}
+
+// Perform the same as codeSomeShards, but taking advantage of
+// AVX512 parallelism for up to 4x faster execution as compared to AVX2
+func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+ var wg sync.WaitGroup
+ do := byteCount / r.o.maxGoroutines
+ if do < r.o.minSplitSize {
+ do = r.o.minSplitSize
+ }
+ // Make sizes divisible by 64
+ do = (do + 63) & (^63)
+ start := 0
+ for start < byteCount {
+ if start+do > byteCount {
+ do = byteCount - start
+ }
+ wg.Add(1)
+ go func(grStart, grStop int) {
+ start, stop := grStart, grStart+r.o.perRound
+ if stop > grStop {
+ stop = grStop
+ }
+ // Loop for each round.
+ matrix84 := [matrixSize84]byte{}
+ matrix82 := [matrixSize82]byte{}
+ matrix81 := [matrixSize81]byte{}
+ for start < grStop {
+ outputRow := 0
+ // First process (multiple) batches of 4 output rows in parallel
+ if outputRow+dimOut84 <= outputCount {
+ // 1K matrix buffer
+ for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 {
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix84(matrixRows, inputRow, outputRow, &matrix84)
+ galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix84)
+ }
+ }
+ }
+ // Then process a (single) batch of 2 output rows in parallel
+ if outputRow+dimOut82 <= outputCount {
+ // 512B matrix buffer
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix82(matrixRows, inputRow, outputRow, &matrix82)
+ galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix82)
+ }
+ outputRow += dimOut82
+ }
+ // Lastly, we may have a single output row left (for uneven parity)
+ if outputRow < outputCount {
+ for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+ setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
+ galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
+ }
+ }
+ start = stop
+ stop += r.o.perRound
+ if stop > grStop {
+ stop = grStop
+ }
+ }
+ wg.Done()
+ }(start, start+do)
+ start += do
+ }
+ wg.Wait()
+}