diff options
author | Arne Schwabe <arne@rfc2549.org> | 2015-04-15 00:17:26 +0200 |
---|---|---|
committer | Arne Schwabe <arne@rfc2549.org> | 2015-04-15 00:20:23 +0200 |
commit | c3ae4aaac9f0b168aed063d3e86c5196608eaba1 (patch) | |
tree | 1a18e7d8751d4dd3682d82d12c8441b335112984 /main/openssl/crypto/modes/asm/ghash-alpha.pl | |
parent | 5e42114d22faefe7c272b1b498fdf5640da494c7 (diff) |
Move more to git, add submodules, fix build script, change hgignore to gitignore
Diffstat (limited to 'main/openssl/crypto/modes/asm/ghash-alpha.pl')
m--------- | main/openssl | 0 | ||||
-rw-r--r-- | main/openssl/crypto/modes/asm/ghash-alpha.pl | 460 |
2 files changed, 0 insertions, 460 deletions
diff --git a/main/openssl b/main/openssl new file mode 160000 +Subproject 4d377a9ce111930d8a8f06dc0e94a892a7f6c51 diff --git a/main/openssl/crypto/modes/asm/ghash-alpha.pl b/main/openssl/crypto/modes/asm/ghash-alpha.pl deleted file mode 100644 index aa360293..00000000 --- a/main/openssl/crypto/modes/asm/ghash-alpha.pl +++ /dev/null @@ -1,460 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# March 2010 -# -# The module implements "4-bit" GCM GHASH function and underlying -# single multiplication operation in GF(2^128). "4-bit" means that it -# uses 256 bytes per-key table [+128 bytes shared table]. Even though -# loops are aggressively modulo-scheduled in respect to references to -# Htbl and Z.hi updates for 8 cycles per byte, measured performance is -# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic -# scheduling "glitch," because uprofile(1) indicates uniform sample -# distribution, as if all instruction bundles execute in 1.5 cycles. -# Meaning that it could have been even faster, yet 12 cycles is ~60% -# better than gcc-generated code and ~80% than code generated by vendor -# compiler. - -$cnt="v0"; # $0 -$t0="t0"; -$t1="t1"; -$t2="t2"; -$Thi0="t3"; # $4 -$Tlo0="t4"; -$Thi1="t5"; -$Tlo1="t6"; -$rem="t7"; # $8 -################# -$Xi="a0"; # $16, input argument block -$Htbl="a1"; -$inp="a2"; -$len="a3"; -$nlo="a4"; # $20 -$nhi="a5"; -$Zhi="t8"; -$Zlo="t9"; -$Xhi="t10"; # $24 -$Xlo="t11"; -$remp="t12"; -$rem_4bit="AT"; # $28 - -{ my $N; - sub loop() { - - $N++; -$code.=<<___; -.align 4 - extbl $Xlo,7,$nlo - and $nlo,0xf0,$nhi - sll $nlo,4,$nlo - and $nlo,0xf0,$nlo - - addq $nlo,$Htbl,$nlo - ldq $Zlo,8($nlo) - addq $nhi,$Htbl,$nhi - ldq $Zhi,0($nlo) - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - lda $cnt,6(zero) - extbl $Xlo,6,$nlo - - ldq $Tlo1,8($nhi) - s8addq $remp,$rem_4bit,$remp - ldq $Thi1,0($nhi) - srl $Zlo,4,$Zlo - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $t0,$Zlo,$Zlo - and $nlo,0xf0,$nhi - - xor $Tlo1,$Zlo,$Zlo - sll $nlo,4,$nlo - xor $Thi1,$Zhi,$Zhi - and $nlo,0xf0,$nlo - - addq $nlo,$Htbl,$nlo - ldq $Tlo0,8($nlo) - addq $nhi,$Htbl,$nhi - ldq $Thi0,0($nlo) - -.Looplo$N: - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - subq $cnt,1,$cnt - srl $Zlo,4,$Zlo - - ldq $Tlo1,8($nhi) - xor $rem,$Zhi,$Zhi - ldq $Thi1,0($nhi) - s8addq $remp,$rem_4bit,$remp - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $t0,$Zlo,$Zlo - extbl $Xlo,$cnt,$nlo - - and $nlo,0xf0,$nhi - xor $Thi0,$Zhi,$Zhi - xor $Tlo0,$Zlo,$Zlo - sll $nlo,4,$nlo - - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - and $nlo,0xf0,$nlo - srl $Zlo,4,$Zlo - - s8addq $remp,$rem_4bit,$remp - xor $rem,$Zhi,$Zhi - addq $nlo,$Htbl,$nlo - addq $nhi,$Htbl,$nhi - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - ldq $Tlo0,8($nlo) - xor $t0,$Zlo,$Zlo - - xor $Tlo1,$Zlo,$Zlo - xor $Thi1,$Zhi,$Zhi - ldq $Thi0,0($nlo) - bne $cnt,.Looplo$N - - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - lda $cnt,7(zero) - srl $Zlo,4,$Zlo - - ldq $Tlo1,8($nhi) - xor $rem,$Zhi,$Zhi - ldq $Thi1,0($nhi) - s8addq $remp,$rem_4bit,$remp - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $t0,$Zlo,$Zlo - extbl $Xhi,$cnt,$nlo - - and $nlo,0xf0,$nhi - xor $Thi0,$Zhi,$Zhi - xor $Tlo0,$Zlo,$Zlo - sll $nlo,4,$nlo - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - and $nlo,0xf0,$nlo - srl $Zlo,4,$Zlo - - s8addq $remp,$rem_4bit,$remp - xor $rem,$Zhi,$Zhi - addq $nlo,$Htbl,$nlo - addq $nhi,$Htbl,$nhi - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - ldq $Tlo0,8($nlo) - xor $t0,$Zlo,$Zlo - - xor $Tlo1,$Zlo,$Zlo - xor $Thi1,$Zhi,$Zhi - ldq $Thi0,0($nlo) - unop - - -.Loophi$N: - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - subq $cnt,1,$cnt - srl $Zlo,4,$Zlo - - ldq $Tlo1,8($nhi) - xor $rem,$Zhi,$Zhi - ldq $Thi1,0($nhi) - s8addq $remp,$rem_4bit,$remp - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $t0,$Zlo,$Zlo - extbl $Xhi,$cnt,$nlo - - and $nlo,0xf0,$nhi - xor $Thi0,$Zhi,$Zhi - xor $Tlo0,$Zlo,$Zlo - sll $nlo,4,$nlo - - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - and $nlo,0xf0,$nlo - srl $Zlo,4,$Zlo - - s8addq $remp,$rem_4bit,$remp - xor $rem,$Zhi,$Zhi - addq $nlo,$Htbl,$nlo - addq $nhi,$Htbl,$nhi - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - ldq $Tlo0,8($nlo) - xor $t0,$Zlo,$Zlo - - xor $Tlo1,$Zlo,$Zlo - xor $Thi1,$Zhi,$Zhi - ldq $Thi0,0($nlo) - bne $cnt,.Loophi$N - - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - srl $Zlo,4,$Zlo - - ldq $Tlo1,8($nhi) - xor $rem,$Zhi,$Zhi - ldq $Thi1,0($nhi) - s8addq $remp,$rem_4bit,$remp - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $t0,$Zlo,$Zlo - - xor $Tlo0,$Zlo,$Zlo - xor $Thi0,$Zhi,$Zhi - - and $Zlo,0x0f,$remp - sll $Zhi,60,$t0 - srl $Zlo,4,$Zlo - - s8addq $remp,$rem_4bit,$remp - xor $rem,$Zhi,$Zhi - - ldq $rem,0($remp) - srl $Zhi,4,$Zhi - xor $Tlo1,$Zlo,$Zlo - xor $Thi1,$Zhi,$Zhi - xor $t0,$Zlo,$Zlo - xor $rem,$Zhi,$Zhi -___ -}} - -$code=<<___; -#ifdef __linux__ -#include <asm/regdef.h> -#else -#include <asm.h> -#include <regdef.h> -#endif - -.text - -.set noat -.set noreorder -.globl gcm_gmult_4bit -.align 4 -.ent gcm_gmult_4bit -gcm_gmult_4bit: - .frame sp,0,ra - .prologue 0 - - ldq $Xlo,8($Xi) - ldq $Xhi,0($Xi) - - bsr $t0,picmeup - nop -___ - - &loop(); - -$code.=<<___; - srl $Zlo,24,$t0 # byte swap - srl $Zlo,8,$t1 - - sll $Zlo,8,$t2 - sll $Zlo,24,$Zlo - zapnot $t0,0x11,$t0 - zapnot $t1,0x22,$t1 - - zapnot $Zlo,0x88,$Zlo - or $t0,$t1,$t0 - zapnot $t2,0x44,$t2 - - or $Zlo,$t0,$Zlo - srl $Zhi,24,$t0 - srl $Zhi,8,$t1 - - or $Zlo,$t2,$Zlo - sll $Zhi,8,$t2 - sll $Zhi,24,$Zhi - - srl $Zlo,32,$Xlo - sll $Zlo,32,$Zlo - - zapnot $t0,0x11,$t0 - zapnot $t1,0x22,$t1 - or $Zlo,$Xlo,$Xlo - - zapnot $Zhi,0x88,$Zhi - or $t0,$t1,$t0 - zapnot $t2,0x44,$t2 - - or $Zhi,$t0,$Zhi - or $Zhi,$t2,$Zhi - - srl $Zhi,32,$Xhi - sll $Zhi,32,$Zhi - - or $Zhi,$Xhi,$Xhi - stq $Xlo,8($Xi) - stq $Xhi,0($Xi) - - ret (ra) -.end gcm_gmult_4bit -___ - -$inhi="s0"; -$inlo="s1"; - -$code.=<<___; -.globl gcm_ghash_4bit -.align 4 -.ent gcm_ghash_4bit -gcm_ghash_4bit: - lda sp,-32(sp) - stq ra,0(sp) - stq s0,8(sp) - stq s1,16(sp) - .mask 0x04000600,-32 - .frame sp,32,ra - .prologue 0 - - ldq_u $inhi,0($inp) - ldq_u $Thi0,7($inp) - ldq_u $inlo,8($inp) - ldq_u $Tlo0,15($inp) - ldq $Xhi,0($Xi) - ldq $Xlo,8($Xi) - - bsr $t0,picmeup - nop - -.Louter: - extql $inhi,$inp,$inhi - extqh $Thi0,$inp,$Thi0 - or $inhi,$Thi0,$inhi - lda $inp,16($inp) - - extql $inlo,$inp,$inlo - extqh $Tlo0,$inp,$Tlo0 - or $inlo,$Tlo0,$inlo - subq $len,16,$len - - xor $Xlo,$inlo,$Xlo - xor $Xhi,$inhi,$Xhi -___ - - &loop(); - -$code.=<<___; - srl $Zlo,24,$t0 # byte swap - srl $Zlo,8,$t1 - - sll $Zlo,8,$t2 - sll $Zlo,24,$Zlo - zapnot $t0,0x11,$t0 - zapnot $t1,0x22,$t1 - - zapnot $Zlo,0x88,$Zlo - or $t0,$t1,$t0 - zapnot $t2,0x44,$t2 - - or $Zlo,$t0,$Zlo - srl $Zhi,24,$t0 - srl $Zhi,8,$t1 - - or $Zlo,$t2,$Zlo - sll $Zhi,8,$t2 - sll $Zhi,24,$Zhi - - srl $Zlo,32,$Xlo - sll $Zlo,32,$Zlo - beq $len,.Ldone - - zapnot $t0,0x11,$t0 - zapnot $t1,0x22,$t1 - or $Zlo,$Xlo,$Xlo - ldq_u $inhi,0($inp) - - zapnot $Zhi,0x88,$Zhi - or $t0,$t1,$t0 - zapnot $t2,0x44,$t2 - ldq_u $Thi0,7($inp) - - or $Zhi,$t0,$Zhi - or $Zhi,$t2,$Zhi - ldq_u $inlo,8($inp) - ldq_u $Tlo0,15($inp) - - srl $Zhi,32,$Xhi - sll $Zhi,32,$Zhi - - or $Zhi,$Xhi,$Xhi - br zero,.Louter - -.Ldone: - zapnot $t0,0x11,$t0 - zapnot $t1,0x22,$t1 - or $Zlo,$Xlo,$Xlo - - zapnot $Zhi,0x88,$Zhi - or $t0,$t1,$t0 - zapnot $t2,0x44,$t2 - - or $Zhi,$t0,$Zhi - or $Zhi,$t2,$Zhi - - srl $Zhi,32,$Xhi - sll $Zhi,32,$Zhi - - or $Zhi,$Xhi,$Xhi - - stq $Xlo,8($Xi) - stq $Xhi,0($Xi) - - .set noreorder - /*ldq ra,0(sp)*/ - ldq s0,8(sp) - ldq s1,16(sp) - lda sp,32(sp) - ret (ra) -.end gcm_ghash_4bit - -.align 4 -.ent picmeup -picmeup: - .frame sp,0,$t0 - .prologue 0 - br $rem_4bit,.Lpic -.Lpic: lda $rem_4bit,12($rem_4bit) - ret ($t0) -.end picmeup - nop -rem_4bit: - .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 - .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 - .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 - .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 -.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" -.align 4 - -___ -$output=shift and open STDOUT,">$output"; -print $code; -close STDOUT; - |