diff options
author | kali kaneko (leap communications) <kali@leap.se> | 2021-11-29 01:46:27 +0100 |
---|---|---|
committer | kali kaneko (leap communications) <kali@leap.se> | 2021-11-29 18:14:16 +0100 |
commit | 18f52af5be3a9a0c73811706108f790d65ee9c67 (patch) | |
tree | e13cbacb47d56919caa9c44a2b45dec1497a7860 /vendor/github.com/templexxx | |
parent | ebcef0d57b6ecb5a40c6579f6be07182dd3033ba (diff) |
[pkg] update vendor
Diffstat (limited to 'vendor/github.com/templexxx')
33 files changed, 2135 insertions, 0 deletions
diff --git a/vendor/github.com/templexxx/cpu/.gitignore b/vendor/github.com/templexxx/cpu/.gitignore new file mode 100644 index 0000000..f1c181e --- /dev/null +++ b/vendor/github.com/templexxx/cpu/.gitignore @@ -0,0 +1,12 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out diff --git a/vendor/github.com/templexxx/cpu/LICENSE b/vendor/github.com/templexxx/cpu/LICENSE new file mode 100644 index 0000000..dfa8f7b --- /dev/null +++ b/vendor/github.com/templexxx/cpu/LICENSE @@ -0,0 +1,32 @@ +BSD 3-Clause License + +Copyright (c) 2018 Temple3x (temple3x@gmail.com) +Copyright 2017 The Go Authors +Copyright (c) 2015 Klaus Post + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/templexxx/cpu/README.md b/vendor/github.com/templexxx/cpu/README.md new file mode 100644 index 0000000..50ccb9f --- /dev/null +++ b/vendor/github.com/templexxx/cpu/README.md @@ -0,0 +1,23 @@ +# cpu +internal/cpu(in Go standard lib) with these detections: + +>- AVX512 +> +>- Cache Size +> +>- Invariant TSC +> + +It also provides: + +>- False sharing range, see `X86FalseSharingRange` for X86 platform. +> +>- TSC frequency +> +>- Name +> +>- Family & Model + +# Acknowledgement + +[klauspost/cpuid](https://github.com/klauspost/cpuid)
\ No newline at end of file diff --git a/vendor/github.com/templexxx/cpu/cpu.go b/vendor/github.com/templexxx/cpu/cpu.go new file mode 100644 index 0000000..92295d9 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu.go @@ -0,0 +1,234 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cpu implements processor feature detection +// used by the Go standard library. +package cpu + +// debugOptions is set to true by the runtime if go was compiled with GOEXPERIMENT=debugcpu +// and GOOS is Linux or Darwin. This variable is linknamed in runtime/proc.go. +var debugOptions bool + +var X86 x86 + +// "Loads data or instructions from memory to the second-level cache. +// To use the streamer, organize the data or instructions in blocks of 128 bytes, +// aligned on 128 bytes." +// From <Intel® 64 and IA-32 architectures optimization reference manual>, +// in section 3.7.3 "Hardware Prefetching for Second-Level Cache" +// +// In practice, I have found use 128bytes can gain better performance than 64bytes (one cache line). +const X86FalseSharingRange = 128 + +// The booleans in x86 contain the correspondingly named cpuid feature bit. +// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers +// in addition to the cpuid feature bit being set. +// The struct is padded to avoid false sharing. +type x86 struct { + _ [X86FalseSharingRange]byte + HasAES bool + HasADX bool + HasAVX bool + HasAVX2 bool + HasAVX512F bool + HasAVX512DQ bool + HasAVX512BW bool + HasAVX512VL bool + HasBMI1 bool + HasBMI2 bool + HasERMS bool + HasFMA bool + HasOSXSAVE bool + HasPCLMULQDQ bool + HasPOPCNT bool + HasSSE2 bool + HasSSE3 bool + HasSSSE3 bool + HasSSE41 bool + HasSSE42 bool + // The invariant TSC will run at a constant rate in all ACPI P-, C-, and T-states. + // This is the architectural behavior moving forward. 
On processors with + // invariant TSC support, the OS may use the TSC for wall clock timer services (instead of ACPI or HPET timers). + HasInvariantTSC bool + + Cache Cache + + // TSCFrequency only meaningful when HasInvariantTSC == true. + // Unit: Hz. + // + // Warn: + // 1. If it's 0, means can't get it. Don't use it. + // 2. Don't use it if you want "100%" precise timestamp. + TSCFrequency uint64 + + Name string + Signature string // DisplayFamily_DisplayModel. + Family uint32 // CPU family number. + Model uint32 // CPU model number. + + _ [X86FalseSharingRange]byte +} + +// CPU Cache Size. +// -1 if undetected. +type Cache struct { + L1I int + L1D int + L2 int + L3 int +} + +var PPC64 ppc64 + +// For ppc64x, it is safe to check only for ISA level starting on ISA v3.00, +// since there are no optional categories. There are some exceptions that also +// require kernel support to work (darn, scv), so there are feature bits for +// those as well. The minimum processor requirement is POWER8 (ISA 2.07), so we +// maintain some of the old feature checks for optional categories for +// safety. +// The struct is padded to avoid false sharing. +type ppc64 struct { + _ [CacheLineSize]byte + HasVMX bool // Vector unit (Altivec) + HasDFP bool // Decimal Floating Point unit + HasVSX bool // Vector-scalar unit + HasHTM bool // Hardware Transactional Memory + HasISEL bool // Integer select + HasVCRYPTO bool // Vector cryptography + HasHTMNOSC bool // HTM: kernel-aborted transaction in syscalls + HasDARN bool // Hardware random number generator (requires kernel enablement) + HasSCV bool // Syscall vectored (requires kernel enablement) + IsPOWER8 bool // ISA v2.07 (POWER8) + IsPOWER9 bool // ISA v3.00 (POWER9) + _ [CacheLineSize]byte +} + +var ARM64 arm64 + +// The booleans in arm64 contain the correspondingly named cpu feature bit. +// The struct is padded to avoid false sharing. 
+type arm64 struct { + _ [CacheLineSize]byte + HasFP bool + HasASIMD bool + HasEVTSTRM bool + HasAES bool + HasPMULL bool + HasSHA1 bool + HasSHA2 bool + HasCRC32 bool + HasATOMICS bool + HasFPHP bool + HasASIMDHP bool + HasCPUID bool + HasASIMDRDM bool + HasJSCVT bool + HasFCMA bool + HasLRCPC bool + HasDCPOP bool + HasSHA3 bool + HasSM3 bool + HasSM4 bool + HasASIMDDP bool + HasSHA512 bool + HasSVE bool + HasASIMDFHM bool + _ [CacheLineSize]byte +} + +var S390X s390x + +type s390x struct { + _ [CacheLineSize]byte + HasZArch bool // z architecture mode is active [mandatory] + HasSTFLE bool // store facility list extended [mandatory] + HasLDisp bool // long (20-bit) displacements [mandatory] + HasEImm bool // 32-bit immediates [mandatory] + HasDFP bool // decimal floating point + HasETF3Enhanced bool // ETF-3 enhanced + HasMSA bool // message security assist (CPACF) + HasAES bool // KM-AES{128,192,256} functions + HasAESCBC bool // KMC-AES{128,192,256} functions + HasAESCTR bool // KMCTR-AES{128,192,256} functions + HasAESGCM bool // KMA-GCM-AES{128,192,256} functions + HasGHASH bool // KIMD-GHASH function + HasSHA1 bool // K{I,L}MD-SHA-1 functions + HasSHA256 bool // K{I,L}MD-SHA-256 functions + HasSHA512 bool // K{I,L}MD-SHA-512 functions + HasVX bool // vector facility. Note: the runtime sets this when it processes auxv records. + _ [CacheLineSize]byte +} + +// initialize examines the processor and sets the relevant variables above. +// This is called by the runtime package early in program initialization, +// before normal init functions are run. env is set by runtime on Linux and Darwin +// if go was compiled with GOEXPERIMENT=debugcpu. +func init() { + doinit() + processOptions("") +} + +// options contains the cpu debug options that can be used in GODEBUGCPU. +// Options are arch dependent and are added by the arch specific doinit functions. +// Features that are mandatory for the specific GOARCH should not be added to options +// (e.g. SSE2 on amd64). 
+var options []option + +// Option names should be lower case. e.g. avx instead of AVX. +type option struct { + Name string + Feature *bool +} + +// processOptions disables CPU feature values based on the parsed env string. +// The env string is expected to be of the form feature1=0,feature2=0... +// where feature names is one of the architecture specifc list stored in the +// cpu packages options variable. If env contains all=0 then all capabilities +// referenced through the options variable are disabled. Other feature +// names and values other than 0 are silently ignored. +func processOptions(env string) { +field: + for env != "" { + field := "" + i := indexByte(env, ',') + if i < 0 { + field, env = env, "" + } else { + field, env = env[:i], env[i+1:] + } + i = indexByte(field, '=') + if i < 0 { + continue + } + key, value := field[:i], field[i+1:] + + // Only allow turning off CPU features by specifying '0'. + if value == "0" { + if key == "all" { + for _, v := range options { + *v.Feature = false + } + return + } else { + for _, v := range options { + if v.Name == key { + *v.Feature = false + continue field + } + } + } + } + } +} + +// indexByte returns the index of the first instance of c in s, +// or -1 if c is not present in s. +func indexByte(s string, c byte) int { + for i := 0; i < len(s); i++ { + if s[i] == c { + return i + } + } + return -1 +} diff --git a/vendor/github.com/templexxx/cpu/cpu_386.go b/vendor/github.com/templexxx/cpu/cpu_386.go new file mode 100644 index 0000000..561c81f --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_386.go @@ -0,0 +1,7 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package cpu + +const GOARCH = "386" diff --git a/vendor/github.com/templexxx/cpu/cpu_amd64.go b/vendor/github.com/templexxx/cpu/cpu_amd64.go new file mode 100644 index 0000000..9b00153 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_amd64.go @@ -0,0 +1,7 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const GOARCH = "amd64" diff --git a/vendor/github.com/templexxx/cpu/cpu_amd64p32.go b/vendor/github.com/templexxx/cpu/cpu_amd64p32.go new file mode 100644 index 0000000..177b14e --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_amd64p32.go @@ -0,0 +1,7 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const GOARCH = "amd64p32" diff --git a/vendor/github.com/templexxx/cpu/cpu_arm.go b/vendor/github.com/templexxx/cpu/cpu_arm.go new file mode 100644 index 0000000..078a6c3 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_arm.go @@ -0,0 +1,7 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 32 diff --git a/vendor/github.com/templexxx/cpu/cpu_arm64.go b/vendor/github.com/templexxx/cpu/cpu_arm64.go new file mode 100644 index 0000000..487ccf8 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_arm64.go @@ -0,0 +1,102 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 64 + +// arm64 doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2. +// These are linknamed in runtime/os_linux_arm64.go and are initialized by +// archauxv(). 
+var hwcap uint +var hwcap2 uint + +// HWCAP/HWCAP2 bits. These are exposed by Linux. +const ( + hwcap_FP = (1 << 0) + hwcap_ASIMD = (1 << 1) + hwcap_EVTSTRM = (1 << 2) + hwcap_AES = (1 << 3) + hwcap_PMULL = (1 << 4) + hwcap_SHA1 = (1 << 5) + hwcap_SHA2 = (1 << 6) + hwcap_CRC32 = (1 << 7) + hwcap_ATOMICS = (1 << 8) + hwcap_FPHP = (1 << 9) + hwcap_ASIMDHP = (1 << 10) + hwcap_CPUID = (1 << 11) + hwcap_ASIMDRDM = (1 << 12) + hwcap_JSCVT = (1 << 13) + hwcap_FCMA = (1 << 14) + hwcap_LRCPC = (1 << 15) + hwcap_DCPOP = (1 << 16) + hwcap_SHA3 = (1 << 17) + hwcap_SM3 = (1 << 18) + hwcap_SM4 = (1 << 19) + hwcap_ASIMDDP = (1 << 20) + hwcap_SHA512 = (1 << 21) + hwcap_SVE = (1 << 22) + hwcap_ASIMDFHM = (1 << 23) +) + +func doinit() { + options = []option{ + {"evtstrm", &ARM64.HasEVTSTRM}, + {"aes", &ARM64.HasAES}, + {"pmull", &ARM64.HasPMULL}, + {"sha1", &ARM64.HasSHA1}, + {"sha2", &ARM64.HasSHA2}, + {"crc32", &ARM64.HasCRC32}, + {"atomics", &ARM64.HasATOMICS}, + {"fphp", &ARM64.HasFPHP}, + {"asimdhp", &ARM64.HasASIMDHP}, + {"cpuid", &ARM64.HasCPUID}, + {"asimdrdm", &ARM64.HasASIMDRDM}, + {"jscvt", &ARM64.HasJSCVT}, + {"fcma", &ARM64.HasFCMA}, + {"lrcpc", &ARM64.HasLRCPC}, + {"dcpop", &ARM64.HasDCPOP}, + {"sha3", &ARM64.HasSHA3}, + {"sm3", &ARM64.HasSM3}, + {"sm4", &ARM64.HasSM4}, + {"asimddp", &ARM64.HasASIMDDP}, + {"sha512", &ARM64.HasSHA512}, + {"sve", &ARM64.HasSVE}, + {"asimdfhm", &ARM64.HasASIMDFHM}, + + // These capabilities should always be enabled on arm64: + // {"fp", &ARM64.HasFP}, + // {"asimd", &ARM64.HasASIMD}, + } + + // HWCAP feature bits + ARM64.HasFP = isSet(hwcap, hwcap_FP) + ARM64.HasASIMD = isSet(hwcap, hwcap_ASIMD) + ARM64.HasEVTSTRM = isSet(hwcap, hwcap_EVTSTRM) + ARM64.HasAES = isSet(hwcap, hwcap_AES) + ARM64.HasPMULL = isSet(hwcap, hwcap_PMULL) + ARM64.HasSHA1 = isSet(hwcap, hwcap_SHA1) + ARM64.HasSHA2 = isSet(hwcap, hwcap_SHA2) + ARM64.HasCRC32 = isSet(hwcap, hwcap_CRC32) + ARM64.HasATOMICS = isSet(hwcap, hwcap_ATOMICS) + ARM64.HasFPHP = isSet(hwcap, 
hwcap_FPHP) + ARM64.HasASIMDHP = isSet(hwcap, hwcap_ASIMDHP) + ARM64.HasCPUID = isSet(hwcap, hwcap_CPUID) + ARM64.HasASIMDRDM = isSet(hwcap, hwcap_ASIMDRDM) + ARM64.HasJSCVT = isSet(hwcap, hwcap_JSCVT) + ARM64.HasFCMA = isSet(hwcap, hwcap_FCMA) + ARM64.HasLRCPC = isSet(hwcap, hwcap_LRCPC) + ARM64.HasDCPOP = isSet(hwcap, hwcap_DCPOP) + ARM64.HasSHA3 = isSet(hwcap, hwcap_SHA3) + ARM64.HasSM3 = isSet(hwcap, hwcap_SM3) + ARM64.HasSM4 = isSet(hwcap, hwcap_SM4) + ARM64.HasASIMDDP = isSet(hwcap, hwcap_ASIMDDP) + ARM64.HasSHA512 = isSet(hwcap, hwcap_SHA512) + ARM64.HasSVE = isSet(hwcap, hwcap_SVE) + ARM64.HasASIMDFHM = isSet(hwcap, hwcap_ASIMDFHM) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/vendor/github.com/templexxx/cpu/cpu_mips.go b/vendor/github.com/templexxx/cpu/cpu_mips.go new file mode 100644 index 0000000..078a6c3 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_mips.go @@ -0,0 +1,7 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 32 diff --git a/vendor/github.com/templexxx/cpu/cpu_mips64.go b/vendor/github.com/templexxx/cpu/cpu_mips64.go new file mode 100644 index 0000000..078a6c3 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_mips64.go @@ -0,0 +1,7 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 32 diff --git a/vendor/github.com/templexxx/cpu/cpu_mips64le.go b/vendor/github.com/templexxx/cpu/cpu_mips64le.go new file mode 100644 index 0000000..078a6c3 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_mips64le.go @@ -0,0 +1,7 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package cpu + +const CacheLineSize = 32 diff --git a/vendor/github.com/templexxx/cpu/cpu_mipsle.go b/vendor/github.com/templexxx/cpu/cpu_mipsle.go new file mode 100644 index 0000000..078a6c3 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_mipsle.go @@ -0,0 +1,7 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 32 diff --git a/vendor/github.com/templexxx/cpu/cpu_no_init.go b/vendor/github.com/templexxx/cpu/cpu_no_init.go new file mode 100644 index 0000000..1be4f29 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_no_init.go @@ -0,0 +1,16 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !386 +// +build !amd64 +// +build !amd64p32 +// +build !arm64 +// +build !ppc64 +// +build !ppc64le +// +build !s390x + +package cpu + +func doinit() { +} diff --git a/vendor/github.com/templexxx/cpu/cpu_ppc64x.go b/vendor/github.com/templexxx/cpu/cpu_ppc64x.go new file mode 100644 index 0000000..995cf02 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_ppc64x.go @@ -0,0 +1,68 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ppc64 ppc64le + +package cpu + +const CacheLineSize = 128 + +// ppc64x doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2. +// These are linknamed in runtime/os_linux_ppc64x.go and are initialized by +// archauxv(). +var hwcap uint +var hwcap2 uint + +// HWCAP/HWCAP2 bits. These are exposed by the kernel. 
+const ( + // ISA Level + _PPC_FEATURE2_ARCH_2_07 = 0x80000000 + _PPC_FEATURE2_ARCH_3_00 = 0x00800000 + + // CPU features + _PPC_FEATURE_HAS_ALTIVEC = 0x10000000 + _PPC_FEATURE_HAS_DFP = 0x00000400 + _PPC_FEATURE_HAS_VSX = 0x00000080 + _PPC_FEATURE2_HAS_HTM = 0x40000000 + _PPC_FEATURE2_HAS_ISEL = 0x08000000 + _PPC_FEATURE2_HAS_VEC_CRYPTO = 0x02000000 + _PPC_FEATURE2_HTM_NOSC = 0x01000000 + _PPC_FEATURE2_DARN = 0x00200000 + _PPC_FEATURE2_SCV = 0x00100000 +) + +func doinit() { + options = []option{ + {"htm", &PPC64.HasHTM}, + {"htmnosc", &PPC64.HasHTMNOSC}, + {"darn", &PPC64.HasDARN}, + {"scv", &PPC64.HasSCV}, + + // These capabilities should always be enabled on ppc64 and ppc64le: + // {"vmx", &PPC64.HasVMX}, + // {"dfp", &PPC64.HasDFP}, + // {"vsx", &PPC64.HasVSX}, + // {"isel", &PPC64.HasISEL}, + // {"vcrypto", &PPC64.HasVCRYPTO}, + } + + // HWCAP feature bits + PPC64.HasVMX = isSet(hwcap, _PPC_FEATURE_HAS_ALTIVEC) + PPC64.HasDFP = isSet(hwcap, _PPC_FEATURE_HAS_DFP) + PPC64.HasVSX = isSet(hwcap, _PPC_FEATURE_HAS_VSX) + + // HWCAP2 feature bits + PPC64.IsPOWER8 = isSet(hwcap2, _PPC_FEATURE2_ARCH_2_07) + PPC64.HasHTM = isSet(hwcap2, _PPC_FEATURE2_HAS_HTM) + PPC64.HasISEL = isSet(hwcap2, _PPC_FEATURE2_HAS_ISEL) + PPC64.HasVCRYPTO = isSet(hwcap2, _PPC_FEATURE2_HAS_VEC_CRYPTO) + PPC64.HasHTMNOSC = isSet(hwcap2, _PPC_FEATURE2_HTM_NOSC) + PPC64.IsPOWER9 = isSet(hwcap2, _PPC_FEATURE2_ARCH_3_00) + PPC64.HasDARN = isSet(hwcap2, _PPC_FEATURE2_DARN) + PPC64.HasSCV = isSet(hwcap2, _PPC_FEATURE2_SCV) +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/vendor/github.com/templexxx/cpu/cpu_s390x.go b/vendor/github.com/templexxx/cpu/cpu_s390x.go new file mode 100644 index 0000000..389a058 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_s390x.go @@ -0,0 +1,153 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package cpu + +const CacheLineSize = 256 + +// bitIsSet reports whether the bit at index is set. The bit index +// is in big endian order, so bit index 0 is the leftmost bit. +func bitIsSet(bits []uint64, index uint) bool { + return bits[index/64]&((1<<63)>>(index%64)) != 0 +} + +// function is the function code for the named function. +type function uint8 + +const ( + // KM{,A,C,CTR} function codes + aes128 function = 18 // AES-128 + aes192 = 19 // AES-192 + aes256 = 20 // AES-256 + + // K{I,L}MD function codes + sha1 = 1 // SHA-1 + sha256 = 2 // SHA-256 + sha512 = 3 // SHA-512 + + // KLMD function codes + ghash = 65 // GHASH +) + +// queryResult contains the result of a Query function +// call. Bits are numbered in big endian order so the +// leftmost bit (the MSB) is at index 0. +type queryResult struct { + bits [2]uint64 +} + +// Has reports whether the given functions are present. +func (q *queryResult) Has(fns ...function) bool { + if len(fns) == 0 { + panic("no function codes provided") + } + for _, f := range fns { + if !bitIsSet(q.bits[:], uint(f)) { + return false + } + } + return true +} + +// facility is a bit index for the named facility. +type facility uint8 + +const ( + // mandatory facilities + zarch facility = 1 // z architecture mode is active + stflef = 7 // store-facility-list-extended + ldisp = 18 // long-displacement + eimm = 21 // extended-immediate + + // miscellaneous facilities + dfp = 42 // decimal-floating-point + etf3eh = 30 // extended-translation 3 enhancement + + // cryptography facilities + msa = 17 // message-security-assist + msa3 = 76 // message-security-assist extension 3 + msa4 = 77 // message-security-assist extension 4 + msa5 = 57 // message-security-assist extension 5 + msa8 = 146 // message-security-assist extension 8 + + // Note: vx and highgprs are excluded because they require + // kernel support and so must be fetched from HWCAP. +) + +// facilityList contains the result of an STFLE call. 
+// Bits are numbered in big endian order so the +// leftmost bit (the MSB) is at index 0. +type facilityList struct { + bits [4]uint64 +} + +// Has reports whether the given facilities are present. +func (s *facilityList) Has(fs ...facility) bool { + if len(fs) == 0 { + panic("no facility bits provided") + } + for _, f := range fs { + if !bitIsSet(s.bits[:], uint(f)) { + return false + } + } + return true +} + +// The following feature detection functions are defined in cpu_s390x.s. +// They are likely to be expensive to call so the results should be cached. +func stfle() facilityList +func kmQuery() queryResult +func kmcQuery() queryResult +func kmctrQuery() queryResult +func kmaQuery() queryResult +func kimdQuery() queryResult +func klmdQuery() queryResult + +func doinit() { + options = []option{ + {"zarch", &S390X.HasZArch}, + {"stfle", &S390X.HasSTFLE}, + {"ldisp", &S390X.HasLDisp}, + {"msa", &S390X.HasMSA}, + {"eimm", &S390X.HasEImm}, + {"dfp", &S390X.HasDFP}, + {"etf3eh", &S390X.HasETF3Enhanced}, + {"vx", &S390X.HasVX}, + } + + aes := []function{aes128, aes192, aes256} + facilities := stfle() + + S390X.HasZArch = facilities.Has(zarch) + S390X.HasSTFLE = facilities.Has(stflef) + S390X.HasLDisp = facilities.Has(ldisp) + S390X.HasEImm = facilities.Has(eimm) + S390X.HasDFP = facilities.Has(dfp) + S390X.HasETF3Enhanced = facilities.Has(etf3eh) + S390X.HasMSA = facilities.Has(msa) + + if S390X.HasMSA { + // cipher message + km, kmc := kmQuery(), kmcQuery() + S390X.HasAES = km.Has(aes...) + S390X.HasAESCBC = kmc.Has(aes...) + if facilities.Has(msa4) { + kmctr := kmctrQuery() + S390X.HasAESCTR = kmctr.Has(aes...) + } + if facilities.Has(msa8) { + kma := kmaQuery() + S390X.HasAESGCM = kma.Has(aes...) 
+ } + + // compute message digest + kimd := kimdQuery() // intermediate (no padding) + klmd := klmdQuery() // last (padding) + S390X.HasSHA1 = kimd.Has(sha1) && klmd.Has(sha1) + S390X.HasSHA256 = kimd.Has(sha256) && klmd.Has(sha256) + S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512) + S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist + } +} diff --git a/vendor/github.com/templexxx/cpu/cpu_s390x.s b/vendor/github.com/templexxx/cpu/cpu_s390x.s new file mode 100644 index 0000000..9678035 --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_s390x.s @@ -0,0 +1,55 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// func stfle() facilityList +TEXT ·stfle(SB), NOSPLIT|NOFRAME, $0-32 + MOVD $ret+0(FP), R1 + MOVD $3, R0 // last doubleword index to store + XC $32, (R1), (R1) // clear 4 doublewords (32 bytes) + WORD $0xb2b01000 // store facility list extended (STFLE) + RET + +// func kmQuery() queryResult +TEXT ·kmQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KM-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92E0024 // cipher message (KM) + RET + +// func kmcQuery() queryResult +TEXT ·kmcQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMC-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92F0024 // cipher message with chaining (KMC) + RET + +// func kmctrQuery() queryResult +TEXT ·kmctrQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMCTR-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB92D4024 // cipher message with counter (KMCTR) + RET + +// func kmaQuery() queryResult +TEXT ·kmaQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KMA-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xb9296024 // cipher 
message with authentication (KMA) + RET + +// func kimdQuery() queryResult +TEXT ·kimdQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KIMD-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB93E0024 // compute intermediate message digest (KIMD) + RET + +// func klmdQuery() queryResult +TEXT ·klmdQuery(SB), NOSPLIT|NOFRAME, $0-16 + MOVD $0, R0 // set function code to 0 (KLMD-Query) + MOVD $ret+0(FP), R1 // address of 16-byte return value + WORD $0xB93F0024 // compute last message digest (KLMD) + RET diff --git a/vendor/github.com/templexxx/cpu/cpu_wasm.go b/vendor/github.com/templexxx/cpu/cpu_wasm.go new file mode 100644 index 0000000..1107a7a --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_wasm.go @@ -0,0 +1,7 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +const CacheLineSize = 64 diff --git a/vendor/github.com/templexxx/cpu/cpu_x86.go b/vendor/github.com/templexxx/cpu/cpu_x86.go new file mode 100644 index 0000000..313a29a --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_x86.go @@ -0,0 +1,425 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build 386 amd64 amd64p32 + +package cpu + +import ( + "fmt" + "strings" +) + +const CacheLineSize = 64 + +// cpuid is implemented in cpu_x86.s. +func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) + +// xgetbv with ecx = 0 is implemented in cpu_x86.s. 
+func xgetbv() (eax, edx uint32) + +const ( + // edx bits + cpuid_SSE2 = 1 << 26 + + // ecx bits + cpuid_SSE3 = 1 << 0 + cpuid_PCLMULQDQ = 1 << 1 + cpuid_SSSE3 = 1 << 9 + cpuid_FMA = 1 << 12 + cpuid_SSE41 = 1 << 19 + cpuid_SSE42 = 1 << 20 + cpuid_POPCNT = 1 << 23 + cpuid_AES = 1 << 25 + cpuid_OSXSAVE = 1 << 27 + cpuid_AVX = 1 << 28 + + // ebx bits + cpuid_BMI1 = 1 << 3 + cpuid_AVX2 = 1 << 5 + cpuid_BMI2 = 1 << 8 + cpuid_ERMS = 1 << 9 + cpuid_ADX = 1 << 19 + cpuid_AVX512F = 1 << 16 + cpuid_AVX512DQ = 1 << 17 + cpuid_AVX512BW = 1 << 30 + cpuid_AVX512VL = 1 << 31 + + // edx bits + cpuid_Invariant_TSC = 1 << 8 +) + +func doinit() { + options = []option{ + {"adx", &X86.HasADX}, + {"aes", &X86.HasAES}, + {"avx", &X86.HasAVX}, + {"avx2", &X86.HasAVX2}, + {"bmi1", &X86.HasBMI1}, + {"bmi2", &X86.HasBMI2}, + {"erms", &X86.HasERMS}, + {"fma", &X86.HasFMA}, + {"pclmulqdq", &X86.HasPCLMULQDQ}, + {"popcnt", &X86.HasPOPCNT}, + {"sse3", &X86.HasSSE3}, + {"sse41", &X86.HasSSE41}, + {"sse42", &X86.HasSSE42}, + {"ssse3", &X86.HasSSSE3}, + {"avx512f", &X86.HasAVX512F}, + {"avx512dq", &X86.HasAVX512DQ}, + {"avx512bw", &X86.HasAVX512BW}, + {"avx512vl", &X86.HasAVX512VL}, + {"invariant_tsc", &X86.HasInvariantTSC}, + + // sse2 set as last element so it can easily be removed again. See code below. + {"sse2", &X86.HasSSE2}, + } + + // Remove sse2 from options on amd64(p32) because SSE2 is a mandatory feature for these GOARCHs. 
+ if GOARCH == "amd64" || GOARCH == "amd64p32" { + options = options[:len(options)-1] + } + + maxID, _, _, _ := cpuid(0, 0) + + if maxID < 1 { + return + } + + _, _, ecx1, edx1 := cpuid(1, 0) + X86.HasSSE2 = isSet(edx1, cpuid_SSE2) + + X86.HasSSE3 = isSet(ecx1, cpuid_SSE3) + X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ) + X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3) + X86.HasFMA = isSet(ecx1, cpuid_FMA) + X86.HasSSE41 = isSet(ecx1, cpuid_SSE41) + X86.HasSSE42 = isSet(ecx1, cpuid_SSE42) + X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT) + X86.HasAES = isSet(ecx1, cpuid_AES) + X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE) + + osSupportsAVX := false + osSupportsAVX512 := false + // For XGETBV, OSXSAVE bit is required and sufficient. + if X86.HasOSXSAVE { + eax, _ := xgetbv() + // Check if XMM and YMM registers have OS support. + osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2) + // Check is ZMM registers have OS support. + osSupportsAVX512 = isSet(eax>>5, 7) && isSet(eax>>1, 3) + } + + X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX + + if maxID < 7 { + return + } + + _, ebx7, _, _ := cpuid(7, 0) + X86.HasBMI1 = isSet(ebx7, cpuid_BMI1) + X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX + X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512 + X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) && osSupportsAVX512 + X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) && osSupportsAVX512 + X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) && osSupportsAVX512 + X86.HasBMI2 = isSet(ebx7, cpuid_BMI2) + X86.HasERMS = isSet(ebx7, cpuid_ERMS) + X86.HasADX = isSet(ebx7, cpuid_ADX) + + X86.Cache = getCacheSize() + + X86.HasInvariantTSC = hasInvariantTSC() + + X86.Family, X86.Model = getFamilyModel() + + X86.Signature = makeSignature(X86.Family, X86.Model) + + X86.Name = getName() + + X86.TSCFrequency = getNativeTSCFrequency(X86.Name, X86.Signature) +} + +func isSet(hwc uint32, value uint32) bool { + return hwc&value != 0 +} + +func hasInvariantTSC() bool { + if 
maxExtendedFunction() < 0x80000007 {
		return false
	}
	_, _, _, edx := cpuid(0x80000007, 0)
	return isSet(edx, cpuid_Invariant_TSC)
}

// getName assembles the processor name from the four registers returned by
// each of the extended CPUID leaves 0x80000002..0x80000004 and trims
// surrounding spaces. It returns "unknown" when the highest extended leaf
// is below 0x80000004.
func getName() string {
	if maxExtendedFunction() >= 0x80000004 {
		v := make([]uint32, 0, 48)
		for i := uint32(0); i < 3; i++ {
			a, b, c, d := cpuid(0x80000002+i, 0)
			v = append(v, a, b, c, d)
		}
		return strings.Trim(string(valAsString(v...)), " ")
	}
	return "unknown"
}

// getNativeTSCFrequency gets TSC frequency from CPUID,
// only supports Intel (Skylake or later microarchitecture) & key information is from Intel manual & kernel codes
// (especially this commit: https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684).
//
// sign is the DisplayFamily_DisplayModel signature used to look up a
// hardcoded crystal frequency when CPUID does not enumerate one; name is
// accepted but not used. The result is in the same unit as the crystal
// clock frequency (Hz for the hardcoded table), and 0 means the frequency
// could not be determined.
func getNativeTSCFrequency(name, sign string) uint64 {

	if vendorID() != Intel {
		return 0
	}

	if maxFunctionID() < 0x15 {
		return 0
	}

	// ApolloLake, GeminiLake, CannonLake (and presumably all new chipsets
	// from this point) report the crystal frequency directly via CPUID.0x15.
	// That's definitive data that we can rely upon.
	eax, ebx, ecx, _ := cpuid(0x15, 0)

	// If eax or ebx is 0, the TSC/"core crystal clock" ratio is not enumerated.
	// We won't provide TSC frequency detection in this situation
	// (this also guards the division below against a zero denominator).
	if eax == 0 || ebx == 0 {
		return 0
	}

	// Skylake, Kabylake and all variants of those two chipsets report a
	// crystal frequency of zero.
	if ecx == 0 { // Crystal clock frequency is not enumerated.
		ecx = getCrystalClockFrequency(sign)
	}

	// TSC frequency = "core crystal clock frequency" * EBX/EAX.
	//
	// Multiply before dividing: computing EBX/EAX first would truncate any
	// non-integral ratio and under-report the frequency. The product of two
	// 32-bit values always fits in a uint64, so this cannot overflow.
	return uint64(ecx) * uint64(ebx) / uint64(eax)
}

// Copied from: CPUID Signature values of DisplayFamily and DisplayModel,
// in Intel® 64 and IA-32 Architectures Software Developer’s Manual
// Volume 4: Model-Specific Registers
// & https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/intel-family.h
const (
	IntelFam6SkylakeL  = "06_4EH"
	IntelFam6Skylake   = "06_5EH"
	IntelFam6SkylakeX  = "06_55H"
	IntelFam6KabylakeL = "06_8EH"
	IntelFam6Kabylake  = "06_9EH"
)

// getCrystalClockFrequency gets crystal clock frequency
// for Intel processors in which CPUID.15H.EBX[31:0] ÷ CPUID.0x15.EAX[31:0] is enumerated
// but CPUID.15H.ECX is not enumerated using this function to get nominal core crystal clock frequency.
//
// Actually these crystal clock frequencies provided by Intel hardcoded tables are not so accurate in some cases,
// e.g. SkyLake server CPU may have issue (All SKX subject the crystal to an EMI reduction circuit that
// reduces its actual frequency by (approximately) -0.25%):
// see https://lore.kernel.org/lkml/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com/
// for more details.
// With this report, I set a coefficient (0.9975) for IntelFam6SkylakeX.
//
// Unlike the kernel way (mentioned in https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684),
// I prefer the Intel hardcoded tables,
// because after some testing (comparing with wall clock, see https://github.com/templexxx/tsc/tsc_test.go for more details),
// I found hardcoded tables are more accurate.
+func getCrystalClockFrequency(sign string) uint32 { + + if maxFunctionID() < 0x16 { + return 0 + } + + switch sign { + case IntelFam6SkylakeL: + return 24 * 1000 * 1000 + case IntelFam6Skylake: + return 24 * 1000 * 1000 + case IntelFam6SkylakeX: + return 25 * 1000 * 1000 * 0.9975 + case IntelFam6KabylakeL: + return 24 * 1000 * 1000 + case IntelFam6Kabylake: + return 24 * 1000 * 1000 + } + + return 0 +} + +func getFamilyModel() (uint32, uint32) { + if maxFunctionID() < 0x1 { + return 0, 0 + } + eax, _, _, _ := cpuid(1, 0) + family := (eax >> 8) & 0xf + displayFamily := family + if family == 0xf { + displayFamily = ((eax >> 20) & 0xff) + family + } + model := (eax >> 4) & 0xf + displayModel := model + if family == 0x6 || family == 0xf { + displayModel = ((eax >> 12) & 0xf0) + model + } + return displayFamily, displayModel +} + +// signature format: XX_XXH +func makeSignature(family, model uint32) string { + signature := strings.ToUpper(fmt.Sprintf("0%x_0%xH", family, model)) + ss := strings.Split(signature, "_") + for i, s := range ss { + // Maybe insert too more `0`, drop it. 
+ if len(s) > 2 { + s = s[1:] + ss[i] = s + } + } + return strings.Join(ss, "_") +} + +// getCacheSize is from +// https://github.com/klauspost/cpuid/blob/5a626f7029c910cc8329dae5405ee4f65034bce5/cpuid.go#L723 +func getCacheSize() Cache { + c := Cache{ + L1I: -1, + L1D: -1, + L2: -1, + L3: -1, + } + + vendor := vendorID() + switch vendor { + case Intel: + if maxFunctionID() < 4 { + return c + } + for i := uint32(0); ; i++ { + eax, ebx, ecx, _ := cpuid(4, i) + cacheType := eax & 15 + if cacheType == 0 { + break + } + cacheLevel := (eax >> 5) & 7 + coherency := int(ebx&0xfff) + 1 + partitions := int((ebx>>12)&0x3ff) + 1 + associativity := int((ebx>>22)&0x3ff) + 1 + sets := int(ecx) + 1 + size := associativity * partitions * coherency * sets + switch cacheLevel { + case 1: + if cacheType == 1 { + // 1 = Data Cache + c.L1D = size + } else if cacheType == 2 { + // 2 = Instruction Cache + c.L1I = size + } else { + if c.L1D < 0 { + c.L1I = size + } + if c.L1I < 0 { + c.L1I = size + } + } + case 2: + c.L2 = size + case 3: + c.L3 = size + } + } + case AMD, Hygon: + // Untested. 
+ if maxExtendedFunction() < 0x80000005 { + return c + } + _, _, ecx, edx := cpuid(0x80000005, 0) + c.L1D = int(((ecx >> 24) & 0xFF) * 1024) + c.L1I = int(((edx >> 24) & 0xFF) * 1024) + + if maxExtendedFunction() < 0x80000006 { + return c + } + _, _, ecx, _ = cpuid(0x80000006, 0) + c.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) + } + + return c +} + +func maxFunctionID() uint32 { + a, _, _, _ := cpuid(0, 0) + return a +} + +func maxExtendedFunction() uint32 { + eax, _, _, _ := cpuid(0x80000000, 0) + return eax +} + +const ( + Other = iota + Intel + AMD + VIA + Transmeta + NSC + KVM // Kernel-based Virtual Machine + MSVM // Microsoft Hyper-V or Windows Virtual PC + VMware + XenHVM + Bhyve + Hygon +) + +// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID +var vendorMapping = map[string]int{ + "AMDisbetter!": AMD, + "AuthenticAMD": AMD, + "CentaurHauls": VIA, + "GenuineIntel": Intel, + "TransmetaCPU": Transmeta, + "GenuineTMx86": Transmeta, + "Geode by NSC": NSC, + "VIA VIA VIA ": VIA, + "KVMKVMKVMKVM": KVM, + "Microsoft Hv": MSVM, + "VMwareVMware": VMware, + "XenVMMXenVMM": XenHVM, + "bhyve bhyve ": Bhyve, + "HygonGenuine": Hygon, +} + +func vendorID() int { + _, b, c, d := cpuid(0, 0) + v := valAsString(b, d, c) + vend, ok := vendorMapping[string(v)] + if !ok { + return Other + } + return vend +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} diff --git a/vendor/github.com/templexxx/cpu/cpu_x86.s b/vendor/github.com/templexxx/cpu/cpu_x86.s new file mode 100644 index 0000000..228fbcf --- /dev/null +++ b/vendor/github.com/templexxx/cpu/cpu_x86.s @@ -0,0 +1,32 @@ 
+// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build 386 amd64 amd64p32 + +#include "textflag.h" + +// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuid(SB), NOSPLIT, $0-24 + MOVL eaxArg+0(FP), AX + MOVL ecxArg+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv() (eax, edx uint32) +TEXT ·xgetbv(SB),NOSPLIT,$0-8 +#ifdef GOOS_nacl + // nacl does not support XGETBV. + MOVL $0, eax+0(FP) + MOVL $0, edx+4(FP) +#else + MOVL $0, CX + XGETBV + MOVL AX, eax+0(FP) + MOVL DX, edx+4(FP) +#endif + RET diff --git a/vendor/github.com/templexxx/xorsimd/.gitattributes b/vendor/github.com/templexxx/xorsimd/.gitattributes new file mode 100644 index 0000000..68f7d04 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitattributes @@ -0,0 +1 @@ +*.s linguist-language=go:x diff --git a/vendor/github.com/templexxx/xorsimd/.gitignore b/vendor/github.com/templexxx/xorsimd/.gitignore new file mode 100644 index 0000000..43309f8 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/.gitignore @@ -0,0 +1,13 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out +.idea diff --git a/vendor/github.com/templexxx/xorsimd/LICENSE b/vendor/github.com/templexxx/xorsimd/LICENSE new file mode 100644 index 0000000..08ee714 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Temple3x (temple3x@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/templexxx/xorsimd/README.md b/vendor/github.com/templexxx/xorsimd/README.md new file mode 100644 index 0000000..9dce5c9 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/README.md @@ -0,0 +1,46 @@ +# XOR SIMD + +[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10] + +[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg +[2]: https://godoc.org/github.com/templexxx/xorsimd +[3]: https://img.shields.io/badge/license-MIT-blue.svg +[4]: LICENSE +[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg +[6]: https://github.com/templexxx/xorsimd +[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd +[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd +[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg +[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge + +## Introduction: + +>- XOR code engine in pure Go. +> +>- [High Performance](https://github.com/templexxx/xorsimd#performance): +More than 270GB/s per physics core. + +## Performance + +Performance depends mainly on: + +>- CPU instruction extension. 
+> +>- Number of source row vectors. + +**Platform:** + +*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)* + +**All test run on a single Core.** + +`I/O = (src_num + 1) * vector_size / cost` + +| Src Num | Vector size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) |SSE2 I/O (MB/S) | +|-------|-------------|-------------|---------------|---------------| +|5|4KB| 270403.73 | 142825.25 | 74443.91 | +|5|1MB| 26948.34 | 26887.37 | 26950.65 | +|5|8MB| 17881.32 | 17212.56 | 16402.97 | +|10|4KB| 190445.30 | 102953.59 | 53244.04 | +|10|1MB| 26424.44 | 26618.65 | 26094.39 | +|10|8MB| 15471.31 | 14866.72 | 13565.80 | diff --git a/vendor/github.com/templexxx/xorsimd/go.mod b/vendor/github.com/templexxx/xorsimd/go.mod new file mode 100644 index 0000000..ac5f57f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.mod @@ -0,0 +1,5 @@ +module github.com/templexxx/xorsimd + +require github.com/templexxx/cpu v0.0.1 + +go 1.13 diff --git a/vendor/github.com/templexxx/xorsimd/go.sum b/vendor/github.com/templexxx/xorsimd/go.sum new file mode 100644 index 0000000..04d04de --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/go.sum @@ -0,0 +1,2 @@ +github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY= +github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk= diff --git a/vendor/github.com/templexxx/xorsimd/xor.go b/vendor/github.com/templexxx/xorsimd/xor.go new file mode 100644 index 0000000..ae88911 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor.go @@ -0,0 +1,89 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +import "github.com/templexxx/cpu" + +// EnableAVX512 may slow down CPU Clock (maybe not). 
+// TODO need more research: +// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/ +var EnableAVX512 = true + +// cpuFeature indicates which instruction set will be used. +var cpuFeature = getCPUFeature() + +const ( + avx512 = iota + avx2 + sse2 + generic +) + +// TODO: Add ARM feature... +func getCPUFeature() int { + if hasAVX512() && EnableAVX512 { + return avx512 + } else if cpu.X86.HasAVX2 { + return avx2 + } else { + return sse2 // amd64 must has sse2 + } +} + +func hasAVX512() (ok bool) { + + return cpu.X86.HasAVX512VL && + cpu.X86.HasAVX512BW && + cpu.X86.HasAVX512F && + cpu.X86.HasAVX512DQ +} + +// Encode encodes elements from source slice into a +// destination slice. The source and destination may overlap. +// Encode returns the number of bytes encoded, which will be the minimum of +// len(src[i]) and len(dst). +func Encode(dst []byte, src [][]byte) (n int) { + n = checkLen(dst, src) + if n == 0 { + return + } + + dst = dst[:n] + for i := range src { + src[i] = src[i][:n] + } + + if len(src) == 1 { + copy(dst, src[0]) + return + } + + encode(dst, src) + return +} + +func checkLen(dst []byte, src [][]byte) int { + n := len(dst) + for i := range src { + if len(src[i]) < n { + n = len(src[i]) + } + } + + if n <= 0 { + return 0 + } + return n +} + +// Bytes XORs the bytes in a and b into a +// destination slice. The source and destination may overlap. +// +// Bytes returns the number of bytes encoded, which will be the minimum of +// len(dst), len(a), len(b). +func Bytes(dst, a, b []byte) int { + return Encode(dst, [][]byte{a, b}) +} diff --git a/vendor/github.com/templexxx/xorsimd/xor_amd64.go b/vendor/github.com/templexxx/xorsimd/xor_amd64.go new file mode 100644 index 0000000..5d46df3 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_amd64.go @@ -0,0 +1,95 @@ +// Copyright (c) 2019. 
Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +package xorsimd + +func encode(dst []byte, src [][]byte) { + + switch cpuFeature { + case avx512: + encodeAVX512(dst, src) + case avx2: + encodeAVX2(dst, src) + default: + encodeSSE2(dst, src) + } + return +} + +// Bytes8 XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16 XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// Bytes8Align XORs of 8 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 8, +// if not, Bytes8 will panic. +func Bytes8Align(dst, a, b []byte) { + + bytes8(&dst[0], &a[0], &b[0]) +} + +// Bytes16Align XORs of packed 16 Bytes. +// The slice arguments a, b, dst's lengths are assumed to be at least 16, +// if not, Bytes16 will panic. +func Bytes16Align(dst, a, b []byte) { + + bytes16(&dst[0], &a[0], &b[0]) +} + +// BytesA XORs the len(a) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. +// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesA(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(a)) +} + +// BytesB XORs the len(b) bytes in a and b into a +// destination slice. +// The destination should have enough space. +// +// It's used for encoding small bytes slices (< dozens bytes), +// and the slices may not be aligned to 8 bytes or 16 bytes. 
+// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead +// for gain better performance. +func BytesB(dst, a, b []byte) { + + bytesN(&dst[0], &a[0], &b[0], len(b)) +} + +//go:noescape +func encodeAVX512(dst []byte, src [][]byte) + +//go:noescape +func encodeAVX2(dst []byte, src [][]byte) + +//go:noescape +func encodeSSE2(dst []byte, src [][]byte) + +//go:noescape +func bytesN(dst, a, b *byte, n int) + +//go:noescape +func bytes8(dst, a, b *byte) + +//go:noescape +func bytes16(dst, a, b *byte) diff --git a/vendor/github.com/templexxx/xorsimd/xor_generic.go b/vendor/github.com/templexxx/xorsimd/xor_generic.go new file mode 100644 index 0000000..b12908f --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xor_generic.go @@ -0,0 +1,205 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. +// +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 + +package xorsimd + +import ( + "runtime" + "unsafe" +) + +const wordSize = int(unsafe.Sizeof(uintptr(0))) +const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" + +func encode(dst []byte, src [][]byte) { + if supportsUnaligned { + fastEncode(dst, src, len(dst)) + } else { + // TODO(hanwen): if (dst, a, b) have common alignment + // we could still try fastEncode. It is not clear + // how often this happens, and it's only worth it if + // the block encryption itself is hardware + // accelerated. + safeEncode(dst, src, len(dst)) + } + +} + +// fastEncode xor in bulk. It only works on architectures that +// support unaligned read/writes. 
func fastEncode(dst []byte, src [][]byte, n int) {
	// Process as many whole machine words as fit into n bytes.
	w := n / wordSize
	if w > 0 {
		wordBytes := w * wordSize

		// Trimmed views of every source, so the caller's slices are untouched.
		wordAlignSrc := make([][]byte, len(src))
		for i := range src {
			wordAlignSrc[i] = src[i][:wordBytes]
		}
		fastEnc(dst[:wordBytes], wordAlignSrc)
	}

	// XOR the remaining n%wordSize tail bytes one at a time.
	for i := n - n%wordSize; i < n; i++ {
		s := src[0][i]
		for j := 1; j < len(src); j++ {
			s ^= src[j][i]
		}
		dst[i] = s
	}
}

// fastEnc XORs all sources into dst one machine word at a time.
// len(dst) is expected to be a multiple of wordSize; only len(dst)/wordSize
// words are processed.
func fastEnc(dst []byte, src [][]byte) {
	// Reinterpret the byte slices as []uintptr. The length stored in the
	// slice header is still the byte count, so indices must stay below
	// len(dst)/wordSize (enforced by n below).
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	sw := make([][]uintptr, len(src))
	for i := range src {
		sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i]))
	}

	n := len(dst) / wordSize
	for i := 0; i < n; i++ {
		s := sw[0][i]
		for j := 1; j < len(sw); j++ {
			s ^= sw[j][i]
		}
		dw[i] = s
	}
}

// safeEncode XORs the first n bytes of every source into dst byte by byte.
// It is the fallback used when supportsUnaligned is false.
func safeEncode(dst []byte, src [][]byte, n int) {
	for i := 0; i < n; i++ {
		s := src[0][i]
		for j := 1; j < len(src); j++ {
			s ^= src[j][i]
		}
		dst[i] = s
	}
}

// Bytes8 XORs of word 8 Bytes.
// The slice arguments a, b, dst's lengths are assumed to be at least 8,
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {

	bytesWords(dst[:8], a[:8], b[:8])
}

// Bytes16 XORs of packed doubleword 16 Bytes.
// The slice arguments a, b, dst's lengths are assumed to be at least 16,
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {

	bytesWords(dst[:16], a[:16], b[:16])
}

// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture.)
// The slice arguments a and b are assumed to be of equal length.
func bytesWords(dst, a, b []byte) {
	if supportsUnaligned {
		// Word-wise path: view the byte slices as []uintptr and bound the
		// loop by the word count, since the header length is still in bytes.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		n := len(b) / wordSize
		for i := 0; i < n; i++ {
			dw[i] = aw[i] ^ bw[i]
		}
	} else {
		// Byte-wise path used when supportsUnaligned is false.
		n := len(b)
		for i := 0; i < n; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
}

// Bytes8Align XORs of 8 Bytes.
// The slice arguments a, b, dst's lengths are assumed to be at least 8,
// if not, Bytes8Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes8Align(dst, a, b []byte) {

	bytesWordsAlign(dst[:8], a[:8], b[:8])
}

// Bytes16Align XORs of packed 16 Bytes.
// The slice arguments a, b, dst's lengths are assumed to be at least 16,
// if not, Bytes16Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes16Align(dst, a, b []byte) {

	bytesWordsAlign(dst[:16], a[:16], b[:16])
}

// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture.)
// The slice arguments a and b are assumed to be of equal length.
//
// All the byte slices must be aligned to wordsize.
// Unlike bytesWords it always takes the word-wise path, regardless of
// supportsUnaligned.
func bytesWordsAlign(dst, a, b []byte) {
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	n := len(b) / wordSize
	for i := 0; i < n; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
}

// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination should have enough space.
//
// It's used for encoding small bytes slices (< dozens bytes),
// and the slices may not be aligned to 8 bytes or 16 bytes.
// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead
// for gain better performance.
func BytesA(dst, a, b []byte) {

	// dst and b are re-sliced to len(a) before the XOR.
	n := len(a)
	bytesN(dst[:n], a[:n], b[:n], n)
}

// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination should have enough space.
//
// It's used for encoding small bytes slices (< dozens bytes),
// and the slices may not be aligned to 8 bytes or 16 bytes.
// If the length is big, it's better to use 'func Bytes(dst, a, b []byte)' instead
// for gain better performance.
+func BytesB(dst, a, b []byte) { + + n := len(b) + bytesN(dst[:n], a[:n], b[:n], n) +} + +func bytesN(dst, a, b []byte, n int) { + + switch { + case supportsUnaligned: + w := n / wordSize + if w > 0 { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + for i := 0; i < w; i++ { + dw[i] = aw[i] ^ bw[i] + } + } + + for i := (n - n%wordSize); i < n; i++ { + dst[i] = a[i] ^ b[i] + } + default: + for i := 0; i < n; i++ { + dst[i] = a[i] ^ b[i] + } + } +} diff --git a/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s new file mode 100644 index 0000000..23cf924 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX2(dst []byte, src [][]byte) +TEXT ·encodeAVX2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ s+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $127, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop128b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU (src_tmp)(pos*1), Y0 + VMOVDQU 32(src_tmp)(pos*1), Y1 + VMOVDQU 64(src_tmp)(pos*1), Y2 + VMOVDQU 96(src_tmp)(pos*1), Y3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU (src_tmp)(pos*1), Y4 + VMOVDQU 
32(src_tmp)(pos*1), Y5 + VMOVDQU 64(src_tmp)(pos*1), Y6 + VMOVDQU 96(src_tmp)(pos*1), Y7 + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU Y0, (dst)(pos*1) + VMOVDQU Y1, 32(dst)(pos*1) + VMOVDQU Y2, 64(dst)(pos*1) + VMOVDQU Y3, 96(dst)(pos*1) + + ADDQ $128, pos + CMPQ len, pos + JNE loop128b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $127, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $127, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $128 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s new file mode 100644 index 0000000..2ba6b75 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s @@ -0,0 +1,124 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeAVX512(dst []byte, src [][]byte) +TEXT ·encodeAVX512(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $255, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop256b: + MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp + VMOVDQU8 (src_tmp)(pos*1), Z0 + VMOVDQU8 64(src_tmp)(pos*1), Z1 + VMOVDQU8 128(src_tmp)(pos*1), Z2 + VMOVDQU8 192(src_tmp)(pos*1), Z3 + +next_vect: + ADDQ $24, d2src_off // len(slice) = 24 + MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect + VMOVDQU8 (src_tmp)(pos*1), Z4 + VMOVDQU8 64(src_tmp)(pos*1), Z5 + VMOVDQU8 128(src_tmp)(pos*1), Z6 + VMOVDQU8 192(src_tmp)(pos*1), Z7 + VPXORQ Z4, Z0, Z0 + VPXORQ Z5, Z1, Z1 + VPXORQ Z6, Z2, Z2 + VPXORQ Z7, Z3, Z3 + SUBQ $1, csrc_tmp + JGE next_vect + + VMOVDQU8 Z0, (dst)(pos*1) + VMOVDQU8 Z1, 64(dst)(pos*1) + VMOVDQU8 Z2, 128(dst)(pos*1) + VMOVDQU8 Z3, 192(dst)(pos*1) + + ADDQ $256, pos + CMPQ len, pos + JNE loop256b + VZEROUPPER + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $255, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE 
loop_1b + MOVQ len, not_aligned_len + ANDQ $255, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $256 + JGE aligned + RET + +ret: + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s new file mode 100644 index 0000000..8f67edd --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s @@ -0,0 +1,72 @@ +#include "textflag.h" + +// func bytesN(dst, a, b *byte, n int) +TEXT ·bytesN(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ n+24(FP), DX + TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned. + JNZ not_aligned + +aligned: + MOVQ $0, AX // position in slices + +loop16b: + MOVOU (SI)(AX*1), X0 // XOR 16byte forwards. + MOVOU (CX)(AX*1), X1 + PXOR X1, X0 + MOVOU X0, (BX)(AX*1) + ADDQ $16, AX + CMPQ DX, AX + JNE loop16b + RET + +loop_1b: + SUBQ $1, DX // XOR 1byte backwards. + MOVB (SI)(DX*1), DI + MOVB (CX)(DX*1), AX + XORB AX, DI + MOVB DI, (BX)(DX*1) + TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b. + JNZ loop_1b + CMPQ DX, $0 // if len is 0, ret. + JE ret + TESTQ $15, DX // AND 15 & len, if zero jump to aligned. + JZ aligned + +not_aligned: + TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b. + JNE loop_1b + SUBQ $8, DX // XOR 8bytes backwards. + MOVQ (SI)(DX*1), DI + MOVQ (CX)(DX*1), AX + XORQ AX, DI + MOVQ DI, (BX)(DX*1) + CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned. 
+ JGE aligned + +ret: + RET + +// func bytes8(dst, a, b *byte) +TEXT ·bytes8(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ (SI), DI + MOVQ (CX), AX + XORQ AX, DI + MOVQ DI, (BX) + RET + +// func bytes16(dst, a, b *byte) +TEXT ·bytes16(SB), NOSPLIT, $0 + MOVQ d+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVOU (SI), X0 + MOVOU (CX), X1 + PXOR X1, X0 + MOVOU X0, (BX) + RET diff --git a/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s new file mode 100644 index 0000000..38df948 --- /dev/null +++ b/vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s @@ -0,0 +1,123 @@ +// Copyright (c) 2019. Temple3x (temple3x@gmail.com) +// +// Use of this source code is governed by the MIT License +// that can be found in the LICENSE file. + +#include "textflag.h" + +#define dst BX // parity's address +#define d2src SI // two-dimension src_slice's address +#define csrc CX // cnt of src +#define len DX // len of vect +#define pos R8 // job position in vect + +#define csrc_tmp R9 +#define d2src_off R10 +#define src_tmp R11 +#define not_aligned_len R12 +#define src_val0 R13 +#define src_val1 R14 + +// func encodeSSE2(dst []byte, src [][]byte) +TEXT ·encodeSSE2(SB), NOSPLIT, $0 + MOVQ d+0(FP), dst + MOVQ src+24(FP), d2src + MOVQ c+32(FP), csrc + MOVQ l+8(FP), len + TESTQ $63, len + JNZ not_aligned + +aligned: + MOVQ $0, pos + +loop64b: + MOVQ csrc, csrc_tmp + SUBQ $2, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X0 + MOVOU 16(src_tmp)(pos*1), X1 + MOVOU 32(src_tmp)(pos*1), X2 + MOVOU 48(src_tmp)(pos*1), X3 + +next_vect: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVOU (src_tmp)(pos*1), X4 + MOVOU 16(src_tmp)(pos*1), X5 + MOVOU 32(src_tmp)(pos*1), X6 + MOVOU 48(src_tmp)(pos*1), X7 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + SUBQ $1, csrc_tmp + JGE next_vect + + MOVOU X0, (dst)(pos*1) + MOVOU X1, 16(dst)(pos*1) + 
MOVOU X2, 32(dst)(pos*1) + MOVOU X3, 48(dst)(pos*1) + + ADDQ $64, pos + CMPQ len, pos + JNE loop64b + RET + +loop_1b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVB -1(src_tmp)(len*1), src_val0 + +next_vect_1b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVB -1(src_tmp)(len*1), src_val1 + XORB src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_1b + + MOVB src_val0, -1(dst)(len*1) + SUBQ $1, len + TESTQ $7, len + JNZ loop_1b + + CMPQ len, $0 + JE ret + TESTQ $63, len + JZ aligned + +not_aligned: + TESTQ $7, len + JNE loop_1b + MOVQ len, not_aligned_len + ANDQ $63, not_aligned_len + +loop_8b: + MOVQ csrc, csrc_tmp + MOVQ $0, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + SUBQ $2, csrc_tmp + MOVQ -8(src_tmp)(len*1), src_val0 + +next_vect_8b: + ADDQ $24, d2src_off + MOVQ (d2src)(d2src_off*1), src_tmp + MOVQ -8(src_tmp)(len*1), src_val1 + XORQ src_val1, src_val0 + SUBQ $1, csrc_tmp + JGE next_vect_8b + + MOVQ src_val0, -8(dst)(len*1) + SUBQ $8, len + SUBQ $8, not_aligned_len + JG loop_8b + + CMPQ len, $64 + JGE aligned + RET + +ret: + RET |