diff options
Diffstat (limited to 'vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go')
-rw-r--r-- | vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go new file mode 100644 index 0000000..720196f --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go @@ -0,0 +1,338 @@ +//+build !noasm +//+build !appengine +//+build !gccgo + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2019, Minio, Inc. + +package reedsolomon + +import ( + "sync" +) + +//go:noescape +func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) + +//go:noescape +func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool) + +//go:noescape +func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool) + +const ( + dimIn = 8 // Number of input rows processed simultaneously + dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine + dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine + dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine + matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine + matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine + matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine +) + +// Construct block of matrix coefficients for single output row in parallel +func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) { + offset := 0 + for c := inputOffset; c < inputOffset+dimIn; c++ { + for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ { + if c < len(matrixRows[iRow]) { + coeff := matrixRows[iRow][c] + copy(matrix[offset*32:], mulTableLow[coeff][:]) + copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) + } else { + // coefficients not used for this input shard (so null out) + v := matrix[offset*32 : offset*32+32] + for i := range v { + v[i] = 0 + } + } + offset += dimIn + if offset >= dimIn*dimOut81 { + offset -= dimIn*dimOut81 - 1 + } + } + } +} + +// Construct block of matrix coefficients for 2 output rows in parallel +func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) { + offset := 0 + for c := inputOffset; c < inputOffset+dimIn; c++ { + for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ { + if c < len(matrixRows[iRow]) { + coeff := matrixRows[iRow][c] + copy(matrix[offset*32:], mulTableLow[coeff][:]) + copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) + } else { + // coefficients not used for this input shard (so null out) + v := matrix[offset*32 : offset*32+32] + for i := range v { + v[i] = 0 + } + } + offset += dimIn + if offset >= dimIn*dimOut82 { + offset -= dimIn*dimOut82 - 1 + } + } + } +} + +// Construct block of matrix coefficients for 4 output rows in parallel +func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) { + offset := 0 + for c := inputOffset; c < inputOffset+dimIn; c++ { + for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ { + if c < len(matrixRows[iRow]) { + coeff := matrixRows[iRow][c] + copy(matrix[offset*32:], mulTableLow[coeff][:]) + copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) + } else { + // coefficients not used for this input shard (so null out) + v := matrix[offset*32 : offset*32+32] + for i := range v { + v[i] = 0 + } + } + offset += dimIn + if offset >= dimIn*dimOut84 { + offset -= dimIn*dimOut84 - 1 + } + } + } +} + +// Invoke AVX512 routine for single output row in parallel +func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { + done := stop - start + if done <= 0 { + return + } + + inputEnd := inputOffset + dimIn + if inputEnd > len(in) { + inputEnd = len(in) + } + outputEnd := outputOffset + dimOut81 + if outputEnd > len(out) { + outputEnd = len(out) + } + + // We know the max size, alloc temp array. + var inTmp [dimIn][]byte + for i, v := range in[inputOffset:inputEnd] { + inTmp[i] = v[start:stop] + } + var outTmp [dimOut81][]byte + for i, v := range out[outputOffset:outputEnd] { + outTmp[i] = v[start:stop] + } + + addTo := inputOffset != 0 // Except for the first input column, add to previous results + _galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo) + + done = start + ((done >> 6) << 6) + if done < stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) + } +} + +// Invoke AVX512 routine for 2 output rows in parallel +func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { + done := stop - start + if done <= 0 { + return + } + + inputEnd := inputOffset + dimIn + if inputEnd > len(in) { + inputEnd = len(in) + } + outputEnd := outputOffset + dimOut82 + if outputEnd > len(out) { + outputEnd = len(out) + } + + // We know the max size, alloc temp array. + var inTmp [dimIn][]byte + for i, v := range in[inputOffset:inputEnd] { + inTmp[i] = v[start:stop] + } + var outTmp [dimOut82][]byte + for i, v := range out[outputOffset:outputEnd] { + outTmp[i] = v[start:stop] + } + + addTo := inputOffset != 0 // Except for the first input column, add to previous results + _galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo) + + done = start + ((done >> 6) << 6) + if done < stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) + } +} + +// Invoke AVX512 routine for 4 output rows in parallel +func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) { + done := stop - start + if done <= 0 { + return + } + + inputEnd := inputOffset + dimIn + if inputEnd > len(in) { + inputEnd = len(in) + } + outputEnd := outputOffset + dimOut84 + if outputEnd > len(out) { + outputEnd = len(out) + } + + // We know the max size, alloc temp array. + var inTmp [dimIn][]byte + for i, v := range in[inputOffset:inputEnd] { + inTmp[i] = v[start:stop] + } + var outTmp [dimOut84][]byte + for i, v := range out[outputOffset:outputEnd] { + outTmp[i] = v[start:stop] + } + + addTo := inputOffset != 0 // Except for the first input column, add to previous results + _galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo) + + done = start + ((done >> 6) << 6) + if done < stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) + } +} + +func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) { + for c := inputOffset; c < inputEnd; c++ { + for iRow := outputOffset; iRow < outputEnd; iRow++ { + if c < len(matrixRows[iRow]) { + mt := mulTable[matrixRows[iRow][c]][:256] + for i := done; i < stop; i++ { + if c == 0 { // only set value for first input column + out[iRow][i] = mt[in[c][i]] + } else { // and add for all others + out[iRow][i] ^= mt[in[c][i]] + } + } + } + } + } +} + +// Perform the same as codeSomeShards, but taking advantage of +// AVX512 parallelism for up to 4x faster execution as compared to AVX2 +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { + // Process using no goroutines + start, end := 0, r.o.perRound + if end > byteCount { + end = byteCount + } + for start < byteCount { + matrix84 := [matrixSize84]byte{} + matrix82 := [matrixSize82]byte{} + matrix81 := [matrixSize81]byte{} + + outputRow := 0 + // First process (multiple) batches of 4 output rows in parallel + if outputRow+dimOut84 <= outputCount { + for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) + galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix84) + } + } + } + // Then process a (single) batch of 2 output rows in parallel + if outputRow+dimOut82 <= outputCount { + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) + galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix82) + } + outputRow += dimOut82 + } + // Lastly, we may have a single output row left (for uneven parity) + if outputRow < outputCount { + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) + galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81) + } + } + + start = end + end += r.o.perRound + if end > byteCount { + end = byteCount + } + } +} + +// Perform the same as codeSomeShards, but taking advantage of +// AVX512 parallelism for up to 4x faster execution as compared to AVX2 +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { + var wg sync.WaitGroup + do := byteCount / r.o.maxGoroutines + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + wg.Add(1) + go func(grStart, grStop int) { + start, stop := grStart, grStart+r.o.perRound + if stop > grStop { + stop = grStop + } + // Loop for each round. + matrix84 := [matrixSize84]byte{} + matrix82 := [matrixSize82]byte{} + matrix81 := [matrixSize81]byte{} + for start < grStop { + outputRow := 0 + // First process (multiple) batches of 4 output rows in parallel + if outputRow+dimOut84 <= outputCount { + // 1K matrix buffer + for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) + galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix84) + } + } + } + // Then process a (single) batch of 2 output rows in parallel + if outputRow+dimOut82 <= outputCount { + // 512B matrix buffer + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) + galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix82) + } + outputRow += dimOut82 + } + // Lastly, we may have a single output row left (for uneven parity) + if outputRow < outputCount { + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) + galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81) + } + } + start = stop + stop += r.o.perRound + if stop > grStop { + stop = grStop + } + } + wg.Done() + }(start, start+do) + start += do + } + wg.Wait() +} |