summaryrefslogtreecommitdiff
path: root/app/openssl/crypto/modes
diff options
context:
space:
mode:
Diffstat (limited to 'app/openssl/crypto/modes')
-rw-r--r--app/openssl/crypto/modes/asm/ghash-alpha.pl460
-rw-r--r--app/openssl/crypto/modes/asm/ghash-armv4.S522
-rw-r--r--app/openssl/crypto/modes/asm/ghash-armv4.pl492
-rwxr-xr-xapp/openssl/crypto/modes/asm/ghash-ia64.pl463
-rw-r--r--app/openssl/crypto/modes/asm/ghash-parisc.pl731
-rw-r--r--app/openssl/crypto/modes/asm/ghash-s390x.pl262
-rw-r--r--app/openssl/crypto/modes/asm/ghash-sparcv9.pl330
-rw-r--r--app/openssl/crypto/modes/asm/ghash-x86.S1269
-rw-r--r--app/openssl/crypto/modes/asm/ghash-x86.pl1342
-rw-r--r--app/openssl/crypto/modes/asm/ghash-x86_64.S1026
-rw-r--r--app/openssl/crypto/modes/asm/ghash-x86_64.pl806
-rw-r--r--app/openssl/crypto/modes/asm/ghashv8-armx-64.S115
-rw-r--r--app/openssl/crypto/modes/asm/ghashv8-armx.S116
-rw-r--r--app/openssl/crypto/modes/asm/ghashv8-armx.pl240
-rw-r--r--app/openssl/crypto/modes/cbc128.c205
-rw-r--r--app/openssl/crypto/modes/ccm128.c441
-rw-r--r--app/openssl/crypto/modes/cfb128.c242
-rw-r--r--app/openssl/crypto/modes/ctr128.c252
-rw-r--r--app/openssl/crypto/modes/gcm128.c1924
-rw-r--r--app/openssl/crypto/modes/modes_lcl.h128
-rw-r--r--app/openssl/crypto/modes/ofb128.c121
-rw-r--r--app/openssl/crypto/modes/xts128.c187
22 files changed, 0 insertions, 11674 deletions
diff --git a/app/openssl/crypto/modes/asm/ghash-alpha.pl b/app/openssl/crypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index aa360293..00000000
--- a/app/openssl/crypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,460 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Even though
-# loops are aggressively modulo-scheduled in respect to references to
-# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
-# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
-# scheduling "glitch," because uprofile(1) indicates uniform sample
-# distribution, as if all instruction bundles execute in 1.5 cycles.
-# Meaning that it could have been even faster, yet 12 cycles is ~60%
-# better than gcc-generated code and ~80% than code generated by vendor
-# compiler.
-
-$cnt="v0"; # $0
-$t0="t0";
-$t1="t1";
-$t2="t2";
-$Thi0="t3"; # $4
-$Tlo0="t4";
-$Thi1="t5";
-$Tlo1="t6";
-$rem="t7"; # $8
-#################
-$Xi="a0"; # $16, input argument block
-$Htbl="a1";
-$inp="a2";
-$len="a3";
-$nlo="a4"; # $20
-$nhi="a5";
-$Zhi="t8";
-$Zlo="t9";
-$Xhi="t10"; # $24
-$Xlo="t11";
-$remp="t12";
-$rem_4bit="AT"; # $28
-
-{ my $N;
- sub loop() {
-
- $N++;
-$code.=<<___;
-.align 4
- extbl $Xlo,7,$nlo
- and $nlo,0xf0,$nhi
- sll $nlo,4,$nlo
- and $nlo,0xf0,$nlo
-
- addq $nlo,$Htbl,$nlo
- ldq $Zlo,8($nlo)
- addq $nhi,$Htbl,$nhi
- ldq $Zhi,0($nlo)
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- lda $cnt,6(zero)
- extbl $Xlo,6,$nlo
-
- ldq $Tlo1,8($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $Thi1,0($nhi)
- srl $Zlo,4,$Zlo
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- and $nlo,0xf0,$nhi
-
- xor $Tlo1,$Zlo,$Zlo
- sll $nlo,4,$nlo
- xor $Thi1,$Zhi,$Zhi
- and $nlo,0xf0,$nlo
-
- addq $nlo,$Htbl,$nlo
- ldq $Tlo0,8($nlo)
- addq $nhi,$Htbl,$nhi
- ldq $Thi0,0($nlo)
-
-.Looplo$N:
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- subq $cnt,1,$cnt
- srl $Zlo,4,$Zlo
-
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xlo,$cnt,$nlo
-
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
-
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
-
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
-
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- bne $cnt,.Looplo$N
-
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- lda $cnt,7(zero)
- srl $Zlo,4,$Zlo
-
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xhi,$cnt,$nlo
-
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
-
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
-
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- unop
-
-
-.Loophi$N:
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- subq $cnt,1,$cnt
- srl $Zlo,4,$Zlo
-
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xhi,$cnt,$nlo
-
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
-
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
-
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
-
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- bne $cnt,.Loophi$N
-
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- srl $Zlo,4,$Zlo
-
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
-
- xor $Tlo0,$Zlo,$Zlo
- xor $Thi0,$Zhi,$Zhi
-
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- srl $Zlo,4,$Zlo
-
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
-
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- xor $t0,$Zlo,$Zlo
- xor $rem,$Zhi,$Zhi
-___
-}}
-
-$code=<<___;
-#ifdef __linux__
-#include <asm/regdef.h>
-#else
-#include <asm.h>
-#include <regdef.h>
-#endif
-
-.text
-
-.set noat
-.set noreorder
-.globl gcm_gmult_4bit
-.align 4
-.ent gcm_gmult_4bit
-gcm_gmult_4bit:
- .frame sp,0,ra
- .prologue 0
-
- ldq $Xlo,8($Xi)
- ldq $Xhi,0($Xi)
-
- bsr $t0,picmeup
- nop
-___
-
- &loop();
-
-$code.=<<___;
- srl $Zlo,24,$t0 # byte swap
- srl $Zlo,8,$t1
-
- sll $Zlo,8,$t2
- sll $Zlo,24,$Zlo
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
-
- zapnot $Zlo,0x88,$Zlo
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
-
- or $Zlo,$t0,$Zlo
- srl $Zhi,24,$t0
- srl $Zhi,8,$t1
-
- or $Zlo,$t2,$Zlo
- sll $Zhi,8,$t2
- sll $Zhi,24,$Zhi
-
- srl $Zlo,32,$Xlo
- sll $Zlo,32,$Zlo
-
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
-
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
-
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
-
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
-
- or $Zhi,$Xhi,$Xhi
- stq $Xlo,8($Xi)
- stq $Xhi,0($Xi)
-
- ret (ra)
-.end gcm_gmult_4bit
-___
-
-$inhi="s0";
-$inlo="s1";
-
-$code.=<<___;
-.globl gcm_ghash_4bit
-.align 4
-.ent gcm_ghash_4bit
-gcm_ghash_4bit:
- lda sp,-32(sp)
- stq ra,0(sp)
- stq s0,8(sp)
- stq s1,16(sp)
- .mask 0x04000600,-32
- .frame sp,32,ra
- .prologue 0
-
- ldq_u $inhi,0($inp)
- ldq_u $Thi0,7($inp)
- ldq_u $inlo,8($inp)
- ldq_u $Tlo0,15($inp)
- ldq $Xhi,0($Xi)
- ldq $Xlo,8($Xi)
-
- bsr $t0,picmeup
- nop
-
-.Louter:
- extql $inhi,$inp,$inhi
- extqh $Thi0,$inp,$Thi0
- or $inhi,$Thi0,$inhi
- lda $inp,16($inp)
-
- extql $inlo,$inp,$inlo
- extqh $Tlo0,$inp,$Tlo0
- or $inlo,$Tlo0,$inlo
- subq $len,16,$len
-
- xor $Xlo,$inlo,$Xlo
- xor $Xhi,$inhi,$Xhi
-___
-
- &loop();
-
-$code.=<<___;
- srl $Zlo,24,$t0 # byte swap
- srl $Zlo,8,$t1
-
- sll $Zlo,8,$t2
- sll $Zlo,24,$Zlo
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
-
- zapnot $Zlo,0x88,$Zlo
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
-
- or $Zlo,$t0,$Zlo
- srl $Zhi,24,$t0
- srl $Zhi,8,$t1
-
- or $Zlo,$t2,$Zlo
- sll $Zhi,8,$t2
- sll $Zhi,24,$Zhi
-
- srl $Zlo,32,$Xlo
- sll $Zlo,32,$Zlo
- beq $len,.Ldone
-
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
- ldq_u $inhi,0($inp)
-
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- ldq_u $Thi0,7($inp)
-
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
- ldq_u $inlo,8($inp)
- ldq_u $Tlo0,15($inp)
-
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
-
- or $Zhi,$Xhi,$Xhi
- br zero,.Louter
-
-.Ldone:
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
-
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
-
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
-
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
-
- or $Zhi,$Xhi,$Xhi
-
- stq $Xlo,8($Xi)
- stq $Xhi,0($Xi)
-
- .set noreorder
- /*ldq ra,0(sp)*/
- ldq s0,8(sp)
- ldq s1,16(sp)
- lda sp,32(sp)
- ret (ra)
-.end gcm_ghash_4bit
-
-.align 4
-.ent picmeup
-picmeup:
- .frame sp,0,$t0
- .prologue 0
- br $rem_4bit,.Lpic
-.Lpic: lda $rem_4bit,12($rem_4bit)
- ret ($t0)
-.end picmeup
- nop
-rem_4bit:
- .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
- .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
- .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
- .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
-.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-
-___
-$output=shift and open STDOUT,">$output";
-print $code;
-close STDOUT;
-
diff --git a/app/openssl/crypto/modes/asm/ghash-armv4.S b/app/openssl/crypto/modes/asm/ghash-armv4.S
deleted file mode 100644
index 6c453774..00000000
--- a/app/openssl/crypto/modes/asm/ghash-armv4.S
+++ /dev/null
@@ -1,522 +0,0 @@
-#include "arm_arch.h"
-
-.text
-.code 32
-
-.type rem_4bit,%object
-.align 5
-rem_4bit:
-.short 0x0000,0x1C20,0x3840,0x2460
-.short 0x7080,0x6CA0,0x48C0,0x54E0
-.short 0xE100,0xFD20,0xD940,0xC560
-.short 0x9180,0x8DA0,0xA9C0,0xB5E0
-.size rem_4bit,.-rem_4bit
-
-.type rem_4bit_get,%function
-rem_4bit_get:
- sub r2,pc,#8
- sub r2,r2,#32 @ &rem_4bit
- b .Lrem_4bit_got
- nop
-.size rem_4bit_get,.-rem_4bit_get
-
-.global gcm_ghash_4bit
-.type gcm_ghash_4bit,%function
-gcm_ghash_4bit:
- sub r12,pc,#8
- add r3,r2,r3 @ r3 to point at the end
- stmdb sp!,{r3-r11,lr} @ save r3/end too
- sub r12,r12,#48 @ &rem_4bit
-
- ldmia r12,{r4-r11} @ copy rem_4bit ...
- stmdb sp!,{r4-r11} @ ... to stack
-
- ldrb r12,[r2,#15]
- ldrb r14,[r0,#15]
-.Louter:
- eor r12,r12,r14
- and r14,r12,#0xf0
- and r12,r12,#0x0f
- mov r3,#14
-
- add r7,r1,r12,lsl#4
- ldmia r7,{r4-r7} @ load Htbl[nlo]
- add r11,r1,r14
- ldrb r12,[r2,#14]
-
- and r14,r4,#0xf @ rem
- ldmia r11,{r8-r11} @ load Htbl[nhi]
- add r14,r14,r14
- eor r4,r8,r4,lsr#4
- ldrh r8,[sp,r14] @ rem_4bit[rem]
- eor r4,r4,r5,lsl#28
- ldrb r14,[r0,#14]
- eor r5,r9,r5,lsr#4
- eor r5,r5,r6,lsl#28
- eor r6,r10,r6,lsr#4
- eor r6,r6,r7,lsl#28
- eor r7,r11,r7,lsr#4
- eor r12,r12,r14
- and r14,r12,#0xf0
- and r12,r12,#0x0f
- eor r7,r7,r8,lsl#16
-
-.Linner:
- add r11,r1,r12,lsl#4
- and r12,r4,#0xf @ rem
- subs r3,r3,#1
- add r12,r12,r12
- ldmia r11,{r8-r11} @ load Htbl[nlo]
- eor r4,r8,r4,lsr#4
- eor r4,r4,r5,lsl#28
- eor r5,r9,r5,lsr#4
- eor r5,r5,r6,lsl#28
- ldrh r8,[sp,r12] @ rem_4bit[rem]
- eor r6,r10,r6,lsr#4
- ldrplb r12,[r2,r3]
- eor r6,r6,r7,lsl#28
- eor r7,r11,r7,lsr#4
-
- add r11,r1,r14
- and r14,r4,#0xf @ rem
- eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
- add r14,r14,r14
- ldmia r11,{r8-r11} @ load Htbl[nhi]
- eor r4,r8,r4,lsr#4
- ldrplb r8,[r0,r3]
- eor r4,r4,r5,lsl#28
- eor r5,r9,r5,lsr#4
- ldrh r9,[sp,r14]
- eor r5,r5,r6,lsl#28
- eor r6,r10,r6,lsr#4
- eor r6,r6,r7,lsl#28
- eorpl r12,r12,r8
- eor r7,r11,r7,lsr#4
- andpl r14,r12,#0xf0
- andpl r12,r12,#0x0f
- eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
- bpl .Linner
-
- ldr r3,[sp,#32] @ re-load r3/end
- add r2,r2,#16
- mov r14,r4
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r4,r4
- str r4,[r0,#12]
-#elif defined(__ARMEB__)
- str r4,[r0,#12]
-#else
- mov r9,r4,lsr#8
- strb r4,[r0,#12+3]
- mov r10,r4,lsr#16
- strb r9,[r0,#12+2]
- mov r11,r4,lsr#24
- strb r10,[r0,#12+1]
- strb r11,[r0,#12]
-#endif
- cmp r2,r3
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r5,r5
- str r5,[r0,#8]
-#elif defined(__ARMEB__)
- str r5,[r0,#8]
-#else
- mov r9,r5,lsr#8
- strb r5,[r0,#8+3]
- mov r10,r5,lsr#16
- strb r9,[r0,#8+2]
- mov r11,r5,lsr#24
- strb r10,[r0,#8+1]
- strb r11,[r0,#8]
-#endif
- ldrneb r12,[r2,#15]
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r6,r6
- str r6,[r0,#4]
-#elif defined(__ARMEB__)
- str r6,[r0,#4]
-#else
- mov r9,r6,lsr#8
- strb r6,[r0,#4+3]
- mov r10,r6,lsr#16
- strb r9,[r0,#4+2]
- mov r11,r6,lsr#24
- strb r10,[r0,#4+1]
- strb r11,[r0,#4]
-#endif
-
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r7,r7
- str r7,[r0,#0]
-#elif defined(__ARMEB__)
- str r7,[r0,#0]
-#else
- mov r9,r7,lsr#8
- strb r7,[r0,#0+3]
- mov r10,r7,lsr#16
- strb r9,[r0,#0+2]
- mov r11,r7,lsr#24
- strb r10,[r0,#0+1]
- strb r11,[r0,#0]
-#endif
-
- bne .Louter
-
- add sp,sp,#36
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size gcm_ghash_4bit,.-gcm_ghash_4bit
-
-.global gcm_gmult_4bit
-.type gcm_gmult_4bit,%function
-gcm_gmult_4bit:
- stmdb sp!,{r4-r11,lr}
- ldrb r12,[r0,#15]
- b rem_4bit_get
-.Lrem_4bit_got:
- and r14,r12,#0xf0
- and r12,r12,#0x0f
- mov r3,#14
-
- add r7,r1,r12,lsl#4
- ldmia r7,{r4-r7} @ load Htbl[nlo]
- ldrb r12,[r0,#14]
-
- add r11,r1,r14
- and r14,r4,#0xf @ rem
- ldmia r11,{r8-r11} @ load Htbl[nhi]
- add r14,r14,r14
- eor r4,r8,r4,lsr#4
- ldrh r8,[r2,r14] @ rem_4bit[rem]
- eor r4,r4,r5,lsl#28
- eor r5,r9,r5,lsr#4
- eor r5,r5,r6,lsl#28
- eor r6,r10,r6,lsr#4
- eor r6,r6,r7,lsl#28
- eor r7,r11,r7,lsr#4
- and r14,r12,#0xf0
- eor r7,r7,r8,lsl#16
- and r12,r12,#0x0f
-
-.Loop:
- add r11,r1,r12,lsl#4
- and r12,r4,#0xf @ rem
- subs r3,r3,#1
- add r12,r12,r12
- ldmia r11,{r8-r11} @ load Htbl[nlo]
- eor r4,r8,r4,lsr#4
- eor r4,r4,r5,lsl#28
- eor r5,r9,r5,lsr#4
- eor r5,r5,r6,lsl#28
- ldrh r8,[r2,r12] @ rem_4bit[rem]
- eor r6,r10,r6,lsr#4
- ldrplb r12,[r0,r3]
- eor r6,r6,r7,lsl#28
- eor r7,r11,r7,lsr#4
-
- add r11,r1,r14
- and r14,r4,#0xf @ rem
- eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
- add r14,r14,r14
- ldmia r11,{r8-r11} @ load Htbl[nhi]
- eor r4,r8,r4,lsr#4
- eor r4,r4,r5,lsl#28
- eor r5,r9,r5,lsr#4
- ldrh r8,[r2,r14] @ rem_4bit[rem]
- eor r5,r5,r6,lsl#28
- eor r6,r10,r6,lsr#4
- eor r6,r6,r7,lsl#28
- eor r7,r11,r7,lsr#4
- andpl r14,r12,#0xf0
- andpl r12,r12,#0x0f
- eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
- bpl .Loop
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r4,r4
- str r4,[r0,#12]
-#elif defined(__ARMEB__)
- str r4,[r0,#12]
-#else
- mov r9,r4,lsr#8
- strb r4,[r0,#12+3]
- mov r10,r4,lsr#16
- strb r9,[r0,#12+2]
- mov r11,r4,lsr#24
- strb r10,[r0,#12+1]
- strb r11,[r0,#12]
-#endif
-
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r5,r5
- str r5,[r0,#8]
-#elif defined(__ARMEB__)
- str r5,[r0,#8]
-#else
- mov r9,r5,lsr#8
- strb r5,[r0,#8+3]
- mov r10,r5,lsr#16
- strb r9,[r0,#8+2]
- mov r11,r5,lsr#24
- strb r10,[r0,#8+1]
- strb r11,[r0,#8]
-#endif
-
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r6,r6
- str r6,[r0,#4]
-#elif defined(__ARMEB__)
- str r6,[r0,#4]
-#else
- mov r9,r6,lsr#8
- strb r6,[r0,#4+3]
- mov r10,r6,lsr#16
- strb r9,[r0,#4+2]
- mov r11,r6,lsr#24
- strb r10,[r0,#4+1]
- strb r11,[r0,#4]
-#endif
-
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev r7,r7
- str r7,[r0,#0]
-#elif defined(__ARMEB__)
- str r7,[r0,#0]
-#else
- mov r9,r7,lsr#8
- strb r7,[r0,#0+3]
- mov r10,r7,lsr#16
- strb r9,[r0,#0+2]
- mov r11,r7,lsr#24
- strb r10,[r0,#0+1]
- strb r11,[r0,#0]
-#endif
-
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size gcm_gmult_4bit,.-gcm_gmult_4bit
-#if __ARM_ARCH__>=7
-.fpu neon
-
-.global gcm_init_neon
-.type gcm_init_neon,%function
-.align 4
-gcm_init_neon:
- vld1.64 d7,[r1,:64]! @ load H
- vmov.i8 q8,#0xe1
- vld1.64 d6,[r1,:64]
- vshl.i64 d17,#57
- vshr.u64 d16,#63 @ t0=0xc2....01
- vdup.8 q9,d7[7]
- vshr.u64 d26,d6,#63
- vshr.s8 q9,#7 @ broadcast carry bit
- vshl.i64 q3,q3,#1
- vand q8,q8,q9
- vorr d7,d26 @ H<<<=1
- veor q3,q3,q8 @ twisted H
- vstmia r0,{q3}
-
- bx lr @ bx lr
-.size gcm_init_neon,.-gcm_init_neon
-
-.global gcm_gmult_neon
-.type gcm_gmult_neon,%function
-.align 4
-gcm_gmult_neon:
- vld1.64 d7,[r0,:64]! @ load Xi
- vld1.64 d6,[r0,:64]!
- vmov.i64 d29,#0x0000ffffffffffff
- vldmia r1,{d26-d27} @ load twisted H
- vmov.i64 d30,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 q3,q3
-#endif
- vmov.i64 d31,#0x000000000000ffff
- veor d28,d26,d27 @ Karatsuba pre-processing
- mov r3,#16
- b .Lgmult_neon
-.size gcm_gmult_neon,.-gcm_gmult_neon
-
-.global gcm_ghash_neon
-.type gcm_ghash_neon,%function
-.align 4
-gcm_ghash_neon:
- vld1.64 d1,[r0,:64]! @ load Xi
- vld1.64 d0,[r0,:64]!
- vmov.i64 d29,#0x0000ffffffffffff
- vldmia r1,{d26-d27} @ load twisted H
- vmov.i64 d30,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 q0,q0
-#endif
- vmov.i64 d31,#0x000000000000ffff
- veor d28,d26,d27 @ Karatsuba pre-processing
-
-.Loop_neon:
- vld1.64 d7,[r2]! @ load inp
- vld1.64 d6,[r2]!
-#ifdef __ARMEL__
- vrev64.8 q3,q3
-#endif
- veor q3,q0 @ inp^=Xi
-.Lgmult_neon:
- vext.8 d16, d26, d26, #1 @ A1
- vmull.p8 q8, d16, d6 @ F = A1*B
- vext.8 d0, d6, d6, #1 @ B1
- vmull.p8 q0, d26, d0 @ E = A*B1
- vext.8 d18, d26, d26, #2 @ A2
- vmull.p8 q9, d18, d6 @ H = A2*B
- vext.8 d22, d6, d6, #2 @ B2
- vmull.p8 q11, d26, d22 @ G = A*B2
- vext.8 d20, d26, d26, #3 @ A3
- veor q8, q8, q0 @ L = E + F
- vmull.p8 q10, d20, d6 @ J = A3*B
- vext.8 d0, d6, d6, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q0, d26, d0 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d6, d6, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d26, d22 @ K = A*B4
- veor q10, q10, q0 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q0, d26, d6 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q0, q0, q8
- veor q0, q0, q10
- veor d6,d6,d7 @ Karatsuba pre-processing
- vext.8 d16, d28, d28, #1 @ A1
- vmull.p8 q8, d16, d6 @ F = A1*B
- vext.8 d2, d6, d6, #1 @ B1
- vmull.p8 q1, d28, d2 @ E = A*B1
- vext.8 d18, d28, d28, #2 @ A2
- vmull.p8 q9, d18, d6 @ H = A2*B
- vext.8 d22, d6, d6, #2 @ B2
- vmull.p8 q11, d28, d22 @ G = A*B2
- vext.8 d20, d28, d28, #3 @ A3
- veor q8, q8, q1 @ L = E + F
- vmull.p8 q10, d20, d6 @ J = A3*B
- vext.8 d2, d6, d6, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q1, d28, d2 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d6, d6, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d28, d22 @ K = A*B4
- veor q10, q10, q1 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q1, d28, d6 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q1, q1, q8
- veor q1, q1, q10
- vext.8 d16, d27, d27, #1 @ A1
- vmull.p8 q8, d16, d7 @ F = A1*B
- vext.8 d4, d7, d7, #1 @ B1
- vmull.p8 q2, d27, d4 @ E = A*B1
- vext.8 d18, d27, d27, #2 @ A2
- vmull.p8 q9, d18, d7 @ H = A2*B
- vext.8 d22, d7, d7, #2 @ B2
- vmull.p8 q11, d27, d22 @ G = A*B2
- vext.8 d20, d27, d27, #3 @ A3
- veor q8, q8, q2 @ L = E + F
- vmull.p8 q10, d20, d7 @ J = A3*B
- vext.8 d4, d7, d7, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q2, d27, d4 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d7, d7, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d27, d22 @ K = A*B4
- veor q10, q10, q2 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q2, d27, d7 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q2, q2, q8
- veor q2, q2, q10
- veor q1,q1,q0 @ Karatsuba post-processing
- veor q1,q1,q2
- veor d1,d1,d2
- veor d4,d4,d3 @ Xh|Xl - 256-bit result
-
- @ equivalent of reduction_avx from ghash-x86_64.pl
- vshl.i64 q9,q0,#57 @ 1st phase
- vshl.i64 q10,q0,#62
- veor q10,q10,q9 @
- vshl.i64 q9,q0,#63
- veor q10, q10, q9 @
- veor d1,d1,d20 @
- veor d4,d4,d21
-
- vshr.u64 q10,q0,#1 @ 2nd phase
- veor q2,q2,q0
- veor q0,q0,q10 @
- vshr.u64 q10,q10,#6
- vshr.u64 q0,q0,#1 @
- veor q0,q0,q2 @
- veor q0,q0,q10 @
-
- subs r3,#16
- bne .Loop_neon
-
-#ifdef __ARMEL__
- vrev64.8 q0,q0
-#endif
- sub r0,#16
- vst1.64 d1,[r0,:64]! @ write out Xi
- vst1.64 d0,[r0,:64]
-
- bx lr @ bx lr
-.size gcm_ghash_neon,.-gcm_ghash_neon
-#endif
-.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
diff --git a/app/openssl/crypto/modes/asm/ghash-armv4.pl b/app/openssl/crypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index b79ecbcc..00000000
--- a/app/openssl/crypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,492 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# April 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+32 bytes shared table]. There is no
-# experimental performance data available yet. The only approximation
-# that can be made at this point is based on code size. Inner loop is
-# 32 instructions long and on single-issue core should execute in <40
-# cycles. Having verified that gcc 3.4 didn't unroll corresponding
-# loop, this assembler loop body was found to be ~3x smaller than
-# compiler-generated one...
-#
-# July 2010
-#
-# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
-# Cortex A8 core and ~25 cycles per processed byte (which was observed
-# to be ~3 times faster than gcc-generated code:-)
-#
-# February 2011
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Cortex A8 core and ~23.5 cycles per byte.
-#
-# March 2011
-#
-# Add NEON implementation featuring polynomial multiplication, i.e. no
-# lookup tables involved. On Cortex A8 it was measured to process one
-# byte in 15 cycles or 55% faster than integer-only code.
-#
-# April 2014
-#
-# Switch to multiplication algorithm suggested in paper referred
-# below and combine it with reduction algorithm from x86 module.
-# Performance improvement over previous version varies from 65% on
-# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
-# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
-# in 9.33.
-#
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
-# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
-# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
-
-# ====================================================================
-# Note about "528B" variant. In ARM case it makes lesser sense to
-# implement it for following reasons:
-#
-# - performance improvement won't be anywhere near 50%, because 128-
-# bit shift operation is neatly fused with 128-bit xor here, and
-# "538B" variant would eliminate only 4-5 instructions out of 32
-# in the inner loop (meaning that estimated improvement is ~15%);
-# - ARM-based systems are often embedded ones and extra memory
-# consumption might be unappreciated (for so little improvement);
-#
-# Byte order [in]dependence. =========================================
-#
-# Caller is expected to maintain specific *dword* order in Htable,
-# namely with *least* significant dword of 128-bit value at *lower*
-# address. This differs completely from C code and has everything to
-# do with ldm instruction and order in which dwords are "consumed" by
-# algorithm. *Byte* order within these dwords in turn is whatever
-# *native* byte order on current platform. See gcm128.c for working
-# example...
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$Xi="r0"; # argument block
-$Htbl="r1";
-$inp="r2";
-$len="r3";
-
-$Zll="r4"; # variables
-$Zlh="r5";
-$Zhl="r6";
-$Zhh="r7";
-$Tll="r8";
-$Tlh="r9";
-$Thl="r10";
-$Thh="r11";
-$nlo="r12";
-################# r13 is stack pointer
-$nhi="r14";
-################# r15 is program counter
-
-$rem_4bit=$inp; # used in gcm_gmult_4bit
-$cnt=$len;
-
-sub Zsmash() {
- my $i=12;
- my @args=@_;
- for ($Zll,$Zlh,$Zhl,$Zhh) {
- $code.=<<___;
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev $_,$_
- str $_,[$Xi,#$i]
-#elif defined(__ARMEB__)
- str $_,[$Xi,#$i]
-#else
- mov $Tlh,$_,lsr#8
- strb $_,[$Xi,#$i+3]
- mov $Thl,$_,lsr#16
- strb $Tlh,[$Xi,#$i+2]
- mov $Thh,$_,lsr#24
- strb $Thl,[$Xi,#$i+1]
- strb $Thh,[$Xi,#$i]
-#endif
-___
- $code.="\t".shift(@args)."\n";
- $i-=4;
- }
-}
-
-$code=<<___;
-#include "arm_arch.h"
-
-.text
-.code 32
-
-.type rem_4bit,%object
-.align 5
-rem_4bit:
-.short 0x0000,0x1C20,0x3840,0x2460
-.short 0x7080,0x6CA0,0x48C0,0x54E0
-.short 0xE100,0xFD20,0xD940,0xC560
-.short 0x9180,0x8DA0,0xA9C0,0xB5E0
-.size rem_4bit,.-rem_4bit
-
-.type rem_4bit_get,%function
-rem_4bit_get:
- sub $rem_4bit,pc,#8
- sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
- b .Lrem_4bit_got
- nop
-.size rem_4bit_get,.-rem_4bit_get
-
-.global gcm_ghash_4bit
-.type gcm_ghash_4bit,%function
-gcm_ghash_4bit:
- sub r12,pc,#8
- add $len,$inp,$len @ $len to point at the end
- stmdb sp!,{r3-r11,lr} @ save $len/end too
- sub r12,r12,#48 @ &rem_4bit
-
- ldmia r12,{r4-r11} @ copy rem_4bit ...
- stmdb sp!,{r4-r11} @ ... to stack
-
- ldrb $nlo,[$inp,#15]
- ldrb $nhi,[$Xi,#15]
-.Louter:
- eor $nlo,$nlo,$nhi
- and $nhi,$nlo,#0xf0
- and $nlo,$nlo,#0x0f
- mov $cnt,#14
-
- add $Zhh,$Htbl,$nlo,lsl#4
- ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
- add $Thh,$Htbl,$nhi
- ldrb $nlo,[$inp,#14]
-
- and $nhi,$Zll,#0xf @ rem
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- add $nhi,$nhi,$nhi
- eor $Zll,$Tll,$Zll,lsr#4
- ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
- eor $Zll,$Zll,$Zlh,lsl#28
- ldrb $nhi,[$Xi,#14]
- eor $Zlh,$Tlh,$Zlh,lsr#4
- eor $Zlh,$Zlh,$Zhl,lsl#28
- eor $Zhl,$Thl,$Zhl,lsr#4
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
- eor $nlo,$nlo,$nhi
- and $nhi,$nlo,#0xf0
- and $nlo,$nlo,#0x0f
- eor $Zhh,$Zhh,$Tll,lsl#16
-
-.Linner:
- add $Thh,$Htbl,$nlo,lsl#4
- and $nlo,$Zll,#0xf @ rem
- subs $cnt,$cnt,#1
- add $nlo,$nlo,$nlo
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
- eor $Zll,$Tll,$Zll,lsr#4
- eor $Zll,$Zll,$Zlh,lsl#28
- eor $Zlh,$Tlh,$Zlh,lsr#4
- eor $Zlh,$Zlh,$Zhl,lsl#28
- ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
- eor $Zhl,$Thl,$Zhl,lsr#4
- ldrplb $nlo,[$inp,$cnt]
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
-
- add $Thh,$Htbl,$nhi
- and $nhi,$Zll,#0xf @ rem
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- add $nhi,$nhi,$nhi
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- eor $Zll,$Tll,$Zll,lsr#4
- ldrplb $Tll,[$Xi,$cnt]
- eor $Zll,$Zll,$Zlh,lsl#28
- eor $Zlh,$Tlh,$Zlh,lsr#4
- ldrh $Tlh,[sp,$nhi]
- eor $Zlh,$Zlh,$Zhl,lsl#28
- eor $Zhl,$Thl,$Zhl,lsr#4
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eorpl $nlo,$nlo,$Tll
- eor $Zhh,$Thh,$Zhh,lsr#4
- andpl $nhi,$nlo,#0xf0
- andpl $nlo,$nlo,#0x0f
- eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
- bpl .Linner
-
- ldr $len,[sp,#32] @ re-load $len/end
- add $inp,$inp,#16
- mov $nhi,$Zll
-___
- &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
-$code.=<<___;
- bne .Louter
-
- add sp,sp,#36
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size gcm_ghash_4bit,.-gcm_ghash_4bit
-
-.global gcm_gmult_4bit
-.type gcm_gmult_4bit,%function
-gcm_gmult_4bit:
- stmdb sp!,{r4-r11,lr}
- ldrb $nlo,[$Xi,#15]
- b rem_4bit_get
-.Lrem_4bit_got:
- and $nhi,$nlo,#0xf0
- and $nlo,$nlo,#0x0f
- mov $cnt,#14
-
- add $Zhh,$Htbl,$nlo,lsl#4
- ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
- ldrb $nlo,[$Xi,#14]
-
- add $Thh,$Htbl,$nhi
- and $nhi,$Zll,#0xf @ rem
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- add $nhi,$nhi,$nhi
- eor $Zll,$Tll,$Zll,lsr#4
- ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
- eor $Zll,$Zll,$Zlh,lsl#28
- eor $Zlh,$Tlh,$Zlh,lsr#4
- eor $Zlh,$Zlh,$Zhl,lsl#28
- eor $Zhl,$Thl,$Zhl,lsr#4
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
- and $nhi,$nlo,#0xf0
- eor $Zhh,$Zhh,$Tll,lsl#16
- and $nlo,$nlo,#0x0f
-
-.Loop:
- add $Thh,$Htbl,$nlo,lsl#4
- and $nlo,$Zll,#0xf @ rem
- subs $cnt,$cnt,#1
- add $nlo,$nlo,$nlo
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
- eor $Zll,$Tll,$Zll,lsr#4
- eor $Zll,$Zll,$Zlh,lsl#28
- eor $Zlh,$Tlh,$Zlh,lsr#4
- eor $Zlh,$Zlh,$Zhl,lsl#28
- ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
- eor $Zhl,$Thl,$Zhl,lsr#4
- ldrplb $nlo,[$Xi,$cnt]
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
-
- add $Thh,$Htbl,$nhi
- and $nhi,$Zll,#0xf @ rem
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- add $nhi,$nhi,$nhi
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- eor $Zll,$Tll,$Zll,lsr#4
- eor $Zll,$Zll,$Zlh,lsl#28
- eor $Zlh,$Tlh,$Zlh,lsr#4
- ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
- eor $Zlh,$Zlh,$Zhl,lsl#28
- eor $Zhl,$Thl,$Zhl,lsr#4
- eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
- andpl $nhi,$nlo,#0xf0
- andpl $nlo,$nlo,#0x0f
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- bpl .Loop
-___
- &Zsmash();
-$code.=<<___;
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size gcm_gmult_4bit,.-gcm_gmult_4bit
-___
-{
-my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
-my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
-my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
-
-sub clmul64x64 {
-my ($r,$a,$b)=@_;
-$code.=<<___;
- vext.8 $t0#lo, $a, $a, #1 @ A1
- vmull.p8 $t0, $t0#lo, $b @ F = A1*B
- vext.8 $r#lo, $b, $b, #1 @ B1
- vmull.p8 $r, $a, $r#lo @ E = A*B1
- vext.8 $t1#lo, $a, $a, #2 @ A2
- vmull.p8 $t1, $t1#lo, $b @ H = A2*B
- vext.8 $t3#lo, $b, $b, #2 @ B2
- vmull.p8 $t3, $a, $t3#lo @ G = A*B2
- vext.8 $t2#lo, $a, $a, #3 @ A3
- veor $t0, $t0, $r @ L = E + F
- vmull.p8 $t2, $t2#lo, $b @ J = A3*B
- vext.8 $r#lo, $b, $b, #3 @ B3
- veor $t1, $t1, $t3 @ M = G + H
- vmull.p8 $r, $a, $r#lo @ I = A*B3
- veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
- vand $t0#hi, $t0#hi, $k48
- vext.8 $t3#lo, $b, $b, #4 @ B4
- veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
- vand $t1#hi, $t1#hi, $k32
- vmull.p8 $t3, $a, $t3#lo @ K = A*B4
- veor $t2, $t2, $r @ N = I + J
- veor $t0#lo, $t0#lo, $t0#hi
- veor $t1#lo, $t1#lo, $t1#hi
- veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
- vand $t2#hi, $t2#hi, $k16
- vext.8 $t0, $t0, $t0, #15
- veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
- vmov.i64 $t3#hi, #0
- vext.8 $t1, $t1, $t1, #14
- veor $t2#lo, $t2#lo, $t2#hi
- vmull.p8 $r, $a, $b @ D = A*B
- vext.8 $t3, $t3, $t3, #12
- vext.8 $t2, $t2, $t2, #13
- veor $t0, $t0, $t1
- veor $t2, $t2, $t3
- veor $r, $r, $t0
- veor $r, $r, $t2
-___
-}
-
-$code.=<<___;
-#if __ARM_ARCH__>=7
-.fpu neon
-
-.global gcm_init_neon
-.type gcm_init_neon,%function
-.align 4
-gcm_init_neon:
- vld1.64 $IN#hi,[r1,:64]! @ load H
- vmov.i8 $t0,#0xe1
- vld1.64 $IN#lo,[r1,:64]
- vshl.i64 $t0#hi,#57
- vshr.u64 $t0#lo,#63 @ t0=0xc2....01
- vdup.8 $t1,$IN#hi[7]
- vshr.u64 $Hlo,$IN#lo,#63
- vshr.s8 $t1,#7 @ broadcast carry bit
- vshl.i64 $IN,$IN,#1
- vand $t0,$t0,$t1
- vorr $IN#hi,$Hlo @ H<<<=1
- veor $IN,$IN,$t0 @ twisted H
- vstmia r0,{$IN}
-
- ret @ bx lr
-.size gcm_init_neon,.-gcm_init_neon
-
-.global gcm_gmult_neon
-.type gcm_gmult_neon,%function
-.align 4
-gcm_gmult_neon:
- vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
- vld1.64 $IN#lo,[$Xi,:64]!
- vmov.i64 $k48,#0x0000ffffffffffff
- vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
- vmov.i64 $k32,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 $IN,$IN
-#endif
- vmov.i64 $k16,#0x000000000000ffff
- veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
- mov $len,#16
- b .Lgmult_neon
-.size gcm_gmult_neon,.-gcm_gmult_neon
-
-.global gcm_ghash_neon
-.type gcm_ghash_neon,%function
-.align 4
-gcm_ghash_neon:
- vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
- vld1.64 $Xl#lo,[$Xi,:64]!
- vmov.i64 $k48,#0x0000ffffffffffff
- vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
- vmov.i64 $k32,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 $Xl,$Xl
-#endif
- vmov.i64 $k16,#0x000000000000ffff
- veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
-
-.Loop_neon:
- vld1.64 $IN#hi,[$inp]! @ load inp
- vld1.64 $IN#lo,[$inp]!
-#ifdef __ARMEL__
- vrev64.8 $IN,$IN
-#endif
- veor $IN,$Xl @ inp^=Xi
-.Lgmult_neon:
-___
- &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
-$code.=<<___;
- veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
-___
- &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
- &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
-$code.=<<___;
- veor $Xm,$Xm,$Xl @ Karatsuba post-processing
- veor $Xm,$Xm,$Xh
- veor $Xl#hi,$Xl#hi,$Xm#lo
- veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
-
- @ equivalent of reduction_avx from ghash-x86_64.pl
- vshl.i64 $t1,$Xl,#57 @ 1st phase
- vshl.i64 $t2,$Xl,#62
- veor $t2,$t2,$t1 @
- vshl.i64 $t1,$Xl,#63
- veor $t2, $t2, $t1 @
- veor $Xl#hi,$Xl#hi,$t2#lo @
- veor $Xh#lo,$Xh#lo,$t2#hi
-
- vshr.u64 $t2,$Xl,#1 @ 2nd phase
- veor $Xh,$Xh,$Xl
- veor $Xl,$Xl,$t2 @
- vshr.u64 $t2,$t2,#6
- vshr.u64 $Xl,$Xl,#1 @
- veor $Xl,$Xl,$Xh @
- veor $Xl,$Xl,$t2 @
-
- subs $len,#16
- bne .Loop_neon
-
-#ifdef __ARMEL__
- vrev64.8 $Xl,$Xl
-#endif
- sub $Xi,#16
- vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
- vst1.64 $Xl#lo,[$Xi,:64]
-
- ret @ bx lr
-.size gcm_ghash_neon,.-gcm_ghash_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/app/openssl/crypto/modes/asm/ghash-ia64.pl b/app/openssl/crypto/modes/asm/ghash-ia64.pl
deleted file mode 100755
index 0354c954..00000000
--- a/app/openssl/crypto/modes/asm/ghash-ia64.pl
+++ /dev/null
@@ -1,463 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
-# GHASH performance was measured to be 6.67 cycles per processed byte
-# on Itanium 2, which is >90% better than Microsoft compiler generated
-# code. To anchor to something else sha1-ia64.pl module processes one
-# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
-# byte.
-
-# September 2010
-#
-# It was originally thought that it makes lesser sense to implement
-# "528B" variant on Itanium 2 for following reason. Because number of
-# functional units is naturally limited, it appeared impossible to
-# implement "528B" loop in 4 cycles, only in 5. This would mean that
-# theoretically performance improvement couldn't be more than 20%.
-# But occasionally you prove yourself wrong:-) I figured out a way to
-# fold couple of instructions and having freed yet another instruction
-# slot by unrolling the loop... Resulting performance is 4.45 cycles
-# per processed byte and 50% better than "256B" version. On original
-# Itanium performance should remain the same as the "256B" version,
-# i.e. ~8.5 cycles.
-
-$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
-
-if ($^O eq "hpux") {
- $ADDP="addp4";
- for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
-} else { $ADDP="add"; }
-for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
- $big_endian=0 if (/\-DL_ENDIAN/); }
-if (!defined($big_endian))
- { $big_endian=(unpack('L',pack('N',1))==1); }
-
-sub loop() {
-my $label=shift;
-my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
-
-# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
-# in scalable manner;-) Naturally assuming data in L1 cache...
-# Special note about 'dep' instruction, which is used to construct
-# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
-# bytes boundary and lower 7 bits of its address are guaranteed to
-# be zero.
-$code.=<<___;
-$label:
-{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
- (p19) dep rem=Zlo,rem_4bitp,3,4 }
-{ .mfi; (p19) xor Zhi=Zhi,Hhi
- ($p17) xor xi[1]=xi[1],in[1] };;
-{ .mfi; (p18) ld8 Hhi=[Hi[1]]
- (p19) shrp Zlo=Zhi,Zlo,4 }
-{ .mfi; (p19) ld8 rem=[rem]
- (p18) and Hi[1]=mask0xf0,xi[2] };;
-{ .mmi; ($p16) ld1 in[0]=[inp],-1
- (p18) xor Zlo=Zlo,Hlo
- (p19) shr.u Zhi=Zhi,4 }
-{ .mib; (p19) xor Hhi=Hhi,rem
- (p18) add Hi[1]=Htbl,Hi[1] };;
-
-{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
- (p18) dep rem=Zlo,rem_4bitp,3,4 }
-{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
- (p18) xor Zhi=Zhi,Hhi };;
-{ .mfi; (p18) ld8 Hhi=[Hi[1]]
- (p18) shrp Zlo=Zhi,Zlo,4 }
-{ .mfi; (p18) ld8 rem=[rem]
- (p17) and Hi[0]=mask0xf0,Hi[0] };;
-{ .mmi; (p16) ld1 xi[0]=[Xi],-1
- (p18) xor Zlo=Zlo,Hlo
- (p18) shr.u Zhi=Zhi,4 }
-{ .mib; (p18) xor Hhi=Hhi,rem
- (p17) add Hi[0]=Htbl,Hi[0]
- br.ctop.sptk $label };;
-___
-}
-
-$code=<<___;
-.explicit
-.text
-
-prevfs=r2; prevlc=r3; prevpr=r8;
-mask0xf0=r21;
-rem=r22; rem_4bitp=r23;
-Xi=r24; Htbl=r25;
-inp=r26; end=r27;
-Hhi=r28; Hlo=r29;
-Zhi=r30; Zlo=r31;
-
-.align 128
-.skip 16 // aligns loop body
-.global gcm_gmult_4bit#
-.proc gcm_gmult_4bit#
-gcm_gmult_4bit:
- .prologue
-{ .mmi; .save ar.pfs,prevfs
- alloc prevfs=ar.pfs,2,6,0,8
- $ADDP Xi=15,in0 // &Xi[15]
- mov rem_4bitp=ip }
-{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
- .save ar.lc,prevlc
- mov prevlc=ar.lc
- .save pr,prevpr
- mov prevpr=pr };;
-
- .body
- .rotr in[3],xi[3],Hi[2]
-
-{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
- mov mask0xf0=0xf0
- brp.loop.imp .Loop1,.Lend1-16};;
-{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
- };;
-{ .mii; shladd Hi[1]=xi[2],4,r0
- mov pr.rot=0x7<<16
- mov ar.lc=13 };;
-{ .mii; and Hi[1]=mask0xf0,Hi[1]
- mov ar.ec=3
- xor Zlo=Zlo,Zlo };;
-{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
- add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
- xor Zhi=Zhi,Zhi };;
-___
- &loop (".Loop1",1);
-$code.=<<___;
-.Lend1:
-{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
-{ .mib; mux1 Zlo=Zlo,\@rev };;
-{ .mib; mux1 Zhi=Zhi,\@rev };;
-{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
- add Hhi=1,Xi };; // pipeline flush on Itanium
-{ .mib; st8 [Hlo]=Zlo
- mov pr=prevpr,0x1ffff };;
-{ .mib; st8 [Hhi]=Zhi
- mov ar.lc=prevlc
- br.ret.sptk.many b0 };;
-.endp gcm_gmult_4bit#
-___
-
-######################################################################
-# "528B" (well, "512B" actualy) streamed GHASH
-#
-$Xip="in0";
-$Htbl="in1";
-$inp="in2";
-$len="in3";
-$rem_8bit="loc0";
-$mask0xff="loc1";
-($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
-
-sub load_htable() {
- for (my $i=0;$i<8;$i++) {
- $code.=<<___;
-{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
- ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
-{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
- ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
-___
- $code.=shift if (($i+$#_)==7);
- $code.="\t};;\n"
- }
-}
-
-$code.=<<___;
-prevsp=r3;
-
-.align 32
-.skip 16 // aligns loop body
-.global gcm_ghash_4bit#
-.proc gcm_ghash_4bit#
-gcm_ghash_4bit:
- .prologue
-{ .mmi; .save ar.pfs,prevfs
- alloc prevfs=ar.pfs,4,2,0,0
- .vframe prevsp
- mov prevsp=sp
- mov $rem_8bit=ip };;
- .body
-{ .mfi; $ADDP r8=0+0,$Htbl
- $ADDP r9=0+8,$Htbl }
-{ .mfi; $ADDP r10=128+0,$Htbl
- $ADDP r11=128+8,$Htbl };;
-___
- &load_htable(
- " $ADDP $Xip=15,$Xip", # &Xi[15]
- " $ADDP $len=$len,$inp", # &inp[len]
- " $ADDP $inp=15,$inp", # &inp[15]
- " mov $mask0xff=0xff",
- " add sp=-512,sp",
- " andcm sp=sp,$mask0xff", # align stack frame
- " add r14=0,sp",
- " add r15=8,sp");
-$code.=<<___;
-{ .mmi; $sum 1<<1 // go big-endian
- add r8=256+0,sp
- add r9=256+8,sp }
-{ .mmi; add r10=256+128+0,sp
- add r11=256+128+8,sp
- add $len=-17,$len };;
-___
-for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
-my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
-$code.=<<___;
-{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
- st8 [r9]=$rhi,16 // Htable[$i].hi
- shrp $rlo=$rhi,$rlo,4 }//;;
-{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
- stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
- shr.u $rhi=$rhi,4 };;
-{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
- st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
-___
-}
-$code.=<<___;
-{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
- ld8 r17=[r9],16 };; // Htable[8].hi
-{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
- ld8 r19=[r9],16 } // Htable[9].hi
-{ .mmi; rum 1<<5 // clear um.mfh
- shrp r16=r17,r16,4 };;
-___
-for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
-$code.=<<___;
-{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
- ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
- shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
-{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
- st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
- shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
-___
-}
-$code.=<<___;
-{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
-{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
- st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
- shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
-{ .mmi; add $Htbl=256,sp // &Htable[0]
- add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
- shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
-{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
- st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
-___
-
-$in="r15";
-@xi=("r16","r17");
-@rem=("r18","r19");
-($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
-($Atbl,$Btbl)=("r26","r27");
-
-$code.=<<___; # (p16)
-{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
- ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
- cmp.eq p0,p6=r0,r0 };; // clear p6
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-
-$code.=<<___; # (p16),(p17)
-{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
- xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
-{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
- dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
- and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
-.align 32
-.LOOP:
-{ .mmi;
-(p6) st8 [$Xip]=$Zhi,13
- xor $Zlo=$Zlo,$Zlo
- add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-
-$code.=<<___; # (p16),(p17),(p18)
-{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
- ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
- xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
-{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
- dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
-{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
- xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
-{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
- ld1 $in=[$inp],-1 } //(p16) *inp--
-{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
- mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
- and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
-{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
- ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
- shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
-{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
- add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-
-for ($i=1;$i<14;$i++) {
-# Above and below fragments are derived from this one by removing
-# unsuitable (p??) instructions.
-$code.=<<___; # (p16),(p17),(p18),(p19)
-{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
- ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
- shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
-{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
- xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
- xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
-{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
- ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
- dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
-{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
- xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
- xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
-{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
- ld1 $in=[$inp],-1 //(p16) *inp--
- shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
-{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
- xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
- and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
-{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
- ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
- shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
-{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
- xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
- add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-}
-
-$code.=<<___; # (p17),(p18),(p19)
-{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
- ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
- shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
-{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
- xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
- xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
-{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
- ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
- dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
-{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
- xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
- xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
-{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
- shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
-{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
- xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
- and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
-{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
- shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
-{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
- xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
- add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-
-$code.=<<___; # (p18),(p19)
-{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
- shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
-{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
- xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
-{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
- xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
-{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
- xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
-{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
- shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
-{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
- xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
-{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
- shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
-{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
- xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
-___
-push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
-
-$code.=<<___; # (p19)
-{ .mmi; cmp.ltu p6,p0=$inp,$len
- add $inp=32,$inp
- shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
-{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
- xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
- add $Xip=9,$Xip };; // &Xi.lo
-{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
-(p6) ld1 $in=[$inp],-1 //[p16] *inp--
-(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
-{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
-(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
-{ .mmi; st8 [$Xip]=$Zlo,-8
-(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
- shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
-{ .mmi;
-(p6) ld1 $in=[$inp],-1 //[p16] *inp--
- xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
-(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
-{ .mib;
-(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
-(p6) br.cond.dptk.many .LOOP };;
-
-{ .mib; st8 [$Xip]=$Zhi };;
-{ .mib; $rum 1<<1 // return to little-endian
- .restore sp
- mov sp=prevsp
- br.ret.sptk.many b0 };;
-.endp gcm_ghash_4bit#
-___
-$code.=<<___;
-.align 128
-.type rem_4bit#,\@object
-rem_4bit:
- data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
- data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
- data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
- data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
-.size rem_4bit#,128
-.type rem_8bit#,\@object
-rem_8bit:
- data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
- data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
- data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
- data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
- data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
- data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
- data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
- data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
- data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
- data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
- data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
- data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
- data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
- data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
- data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
- data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
- data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
- data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
- data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
- data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
- data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
- data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
- data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
- data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
- data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
- data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
- data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
- data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
- data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
- data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
- data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
- data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
-.size rem_8bit#,512
-stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-
-print $code;
-close STDOUT;
diff --git a/app/openssl/crypto/modes/asm/ghash-parisc.pl b/app/openssl/crypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index d5ad96b4..00000000
--- a/app/openssl/crypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,731 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# April 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
-# it processes one byte in 19.6 cycles, which is more than twice as
-# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
-# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
-# processed byte. This is ~2.2x faster than 64-bit code generated by
-# vendor compiler (which used to be very hard to beat:-).
-#
-# Special thanks to polarhome.com for providing HP-UX account.
-
-$flavour = shift;
-$output = shift;
-open STDOUT,">$output";
-
-if ($flavour =~ /64/) {
- $LEVEL ="2.0W";
- $SIZE_T =8;
- $FRAME_MARKER =80;
- $SAVED_RP =16;
- $PUSH ="std";
- $PUSHMA ="std,ma";
- $POP ="ldd";
- $POPMB ="ldd,mb";
- $NREGS =6;
-} else {
- $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
- $SIZE_T =4;
- $FRAME_MARKER =48;
- $SAVED_RP =20;
- $PUSH ="stw";
- $PUSHMA ="stwm";
- $POP ="ldw";
- $POPMB ="ldwm";
- $NREGS =11;
-}
-
-$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
- # [+ argument transfer]
-
-################# volatile registers
-$Xi="%r26"; # argument block
-$Htbl="%r25";
-$inp="%r24";
-$len="%r23";
-$Hhh=$Htbl; # variables
-$Hll="%r22";
-$Zhh="%r21";
-$Zll="%r20";
-$cnt="%r19";
-$rem_4bit="%r28";
-$rem="%r29";
-$mask0xf0="%r31";
-
-################# preserved registers
-$Thh="%r1";
-$Tll="%r2";
-$nlo="%r3";
-$nhi="%r4";
-$byte="%r5";
-if ($SIZE_T==4) {
- $Zhl="%r6";
- $Zlh="%r7";
- $Hhl="%r8";
- $Hlh="%r9";
- $Thl="%r10";
- $Tlh="%r11";
-}
-$rem2="%r6"; # used in PA-RISC 2.0 code
-
-$code.=<<___;
- .LEVEL $LEVEL
- .SPACE \$TEXT\$
- .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
-
- .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
- .ALIGN 64
-gcm_gmult_4bit
- .PROC
- .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
- .ENTRY
- $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
- $PUSHMA %r3,$FRAME(%sp)
- $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
- $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
- $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
-___
-$code.=<<___ if ($SIZE_T==4);
- $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
- $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
- $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
- $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
- $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
-___
-$code.=<<___;
- blr %r0,$rem_4bit
- ldi 3,$rem
-L\$pic_gmult
- andcm $rem_4bit,$rem,$rem_4bit
- addl $inp,$len,$len
- ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
- ldi 0xf0,$mask0xf0
-___
-$code.=<<___ if ($SIZE_T==4);
- ldi 31,$rem
- mtctl $rem,%cr11
- extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
- b L\$parisc1_gmult
- nop
-___
-
-$code.=<<___;
- ldb 15($Xi),$nlo
- ldo 8($Htbl),$Hll
-
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
-
- ldd $nlo($Hll),$Zll
- ldd $nlo($Hhh),$Zhh
-
- depd,z $Zll,60,4,$rem
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldb 14($Xi),$nlo
-
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
- b L\$oop_gmult_pa2
- ldi 13,$cnt
-
- .ALIGN 8
-L\$oop_gmult_pa2
- xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
- depd,z $Zll,60,4,$rem
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nlo($Hll),$Tll
- ldd $nlo($Hhh),$Thh
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
-
- xor $rem,$Zhh,$Zhh
- depd,z $Zll,60,4,$rem
- ldbx $cnt($Xi),$nlo
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
-
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
- ldd $rem($rem_4bit),$rem
-
- xor $Tll,$Zll,$Zll
- addib,uv -1,$cnt,L\$oop_gmult_pa2
- xor $Thh,$Zhh,$Zhh
-
- xor $rem,$Zhh,$Zhh
- depd,z $Zll,60,4,$rem
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nlo($Hll),$Tll
- ldd $nlo($Hhh),$Thh
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
-
- xor $rem,$Zhh,$Zhh
- depd,z $Zll,60,4,$rem
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
-
- xor $rem,$Zhh,$Zhh
- std $Zll,8($Xi)
- std $Zhh,0($Xi)
-___
-
-$code.=<<___ if ($SIZE_T==4);
- b L\$done_gmult
- nop
-
-L\$parisc1_gmult
- ldb 15($Xi),$nlo
- ldo 12($Htbl),$Hll
- ldo 8($Htbl),$Hlh
- ldo 4($Htbl),$Hhl
-
- and $mask0xf0,$nlo,$nhi
- zdep $nlo,27,4,$nlo
-
- ldwx $nlo($Hll),$Zll
- ldwx $nlo($Hlh),$Zlh
- ldwx $nlo($Hhl),$Zhl
- ldwx $nlo($Hhh),$Zhh
- zdep $Zll,28,4,$rem
- ldb 14($Xi),$nlo
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhl,$Zlh,4,$Zlh
- ldwx $nhi($Hlh),$Tlh
- shrpw $Zhh,$Zhl,4,$Zhl
- ldwx $nhi($Hhl),$Thl
- extru $Zhh,27,28,$Zhh
- ldwx $nhi($Hhh),$Thh
- xor $rem,$Zhh,$Zhh
- and $mask0xf0,$nlo,$nhi
- zdep $nlo,27,4,$nlo
-
- xor $Tll,$Zll,$Zll
- ldwx $nlo($Hll),$Tll
- xor $Tlh,$Zlh,$Zlh
- ldwx $nlo($Hlh),$Tlh
- xor $Thl,$Zhl,$Zhl
- b L\$oop_gmult_pa1
- ldi 13,$cnt
-
- .ALIGN 8
-L\$oop_gmult_pa1
- zdep $Zll,28,4,$rem
- ldwx $nlo($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nlo($Hhh),$Thh
- shrpw $Zhl,$Zlh,4,$Zlh
- ldbx $cnt($Xi),$nlo
- xor $Tll,$Zll,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhh,$Zhl,4,$Zhl
- xor $Tlh,$Zlh,$Zlh
- ldwx $nhi($Hlh),$Tlh
- extru $Zhh,27,28,$Zhh
- xor $Thl,$Zhl,$Zhl
- ldwx $nhi($Hhl),$Thl
- xor $rem,$Zhh,$Zhh
- zdep $Zll,28,4,$rem
- xor $Thh,$Zhh,$Zhh
- ldwx $nhi($Hhh),$Thh
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $rem($rem_4bit),$rem
- shrpw $Zhl,$Zlh,4,$Zlh
- shrpw $Zhh,$Zhl,4,$Zhl
- and $mask0xf0,$nlo,$nhi
- extru $Zhh,27,28,$Zhh
- zdep $nlo,27,4,$nlo
- xor $Tll,$Zll,$Zll
- ldwx $nlo($Hll),$Tll
- xor $Tlh,$Zlh,$Zlh
- ldwx $nlo($Hlh),$Tlh
- xor $rem,$Zhh,$Zhh
- addib,uv -1,$cnt,L\$oop_gmult_pa1
- xor $Thl,$Zhl,$Zhl
-
- zdep $Zll,28,4,$rem
- ldwx $nlo($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nlo($Hhh),$Thh
- shrpw $Zhl,$Zlh,4,$Zlh
- xor $Tll,$Zll,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhh,$Zhl,4,$Zhl
- xor $Tlh,$Zlh,$Zlh
- ldwx $nhi($Hlh),$Tlh
- extru $Zhh,27,28,$Zhh
- xor $rem,$Zhh,$Zhh
- xor $Thl,$Zhl,$Zhl
- ldwx $nhi($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $nhi($Hhh),$Thh
- zdep $Zll,28,4,$rem
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- shrpw $Zhl,$Zlh,4,$Zlh
- shrpw $Zhh,$Zhl,4,$Zhl
- extru $Zhh,27,28,$Zhh
- xor $Tll,$Zll,$Zll
- xor $Tlh,$Zlh,$Zlh
- xor $rem,$Zhh,$Zhh
- stw $Zll,12($Xi)
- xor $Thl,$Zhl,$Zhl
- stw $Zlh,8($Xi)
- xor $Thh,$Zhh,$Zhh
- stw $Zhl,4($Xi)
- stw $Zhh,0($Xi)
-___
-$code.=<<___;
-L\$done_gmult
- $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
- $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
- $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
- $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
-___
-$code.=<<___ if ($SIZE_T==4);
- $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
- $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
- $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
- $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
- $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
-___
-$code.=<<___;
- bv (%r2)
- .EXIT
- $POPMB -$FRAME(%sp),%r3
- .PROCEND
-
- .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
- .ALIGN 64
-gcm_ghash_4bit
- .PROC
- .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
- .ENTRY
- $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
- $PUSHMA %r3,$FRAME(%sp)
- $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
- $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
- $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
-___
-$code.=<<___ if ($SIZE_T==4);
- $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
- $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
- $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
- $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
- $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
-___
-$code.=<<___;
- blr %r0,$rem_4bit
- ldi 3,$rem
-L\$pic_ghash
- andcm $rem_4bit,$rem,$rem_4bit
- addl $inp,$len,$len
- ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
- ldi 0xf0,$mask0xf0
-___
-$code.=<<___ if ($SIZE_T==4);
- ldi 31,$rem
- mtctl $rem,%cr11
- extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
- b L\$parisc1_ghash
- nop
-___
-
-$code.=<<___;
- ldb 15($Xi),$nlo
- ldo 8($Htbl),$Hll
-
-L\$outer_ghash_pa2
- ldb 15($inp),$nhi
- xor $nhi,$nlo,$nlo
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
-
- ldd $nlo($Hll),$Zll
- ldd $nlo($Hhh),$Zhh
-
- depd,z $Zll,60,4,$rem
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldb 14($Xi),$nlo
- ldb 14($inp),$byte
-
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
- xor $byte,$nlo,$nlo
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
- b L\$oop_ghash_pa2
- ldi 13,$cnt
-
- .ALIGN 8
-L\$oop_ghash_pa2
- xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
- depd,z $Zll,60,4,$rem2
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nlo($Hll),$Tll
- ldd $nlo($Hhh),$Thh
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldbx $cnt($Xi),$nlo
- ldbx $cnt($inp),$byte
-
- depd,z $Zll,60,4,$rem
- shrpd $Zhh,$Zll,4,$Zll
- ldd $rem2($rem_4bit),$rem2
-
- xor $rem2,$Zhh,$Zhh
- xor $byte,$nlo,$nlo
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
-
- and $mask0xf0,$nlo,$nhi
- depd,z $nlo,59,4,$nlo
-
- extrd,u $Zhh,59,60,$Zhh
- xor $Tll,$Zll,$Zll
-
- ldd $rem($rem_4bit),$rem
- addib,uv -1,$cnt,L\$oop_ghash_pa2
- xor $Thh,$Zhh,$Zhh
-
- xor $rem,$Zhh,$Zhh
- depd,z $Zll,60,4,$rem2
-
- shrpd $Zhh,$Zll,4,$Zll
- extrd,u $Zhh,59,60,$Zhh
- ldd $nlo($Hll),$Tll
- ldd $nlo($Hhh),$Thh
-
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
-
- depd,z $Zll,60,4,$rem
- shrpd $Zhh,$Zll,4,$Zll
- ldd $rem2($rem_4bit),$rem2
-
- xor $rem2,$Zhh,$Zhh
- ldd $nhi($Hll),$Tll
- ldd $nhi($Hhh),$Thh
-
- extrd,u $Zhh,59,60,$Zhh
- xor $Tll,$Zll,$Zll
- xor $Thh,$Zhh,$Zhh
- ldd $rem($rem_4bit),$rem
-
- xor $rem,$Zhh,$Zhh
- std $Zll,8($Xi)
- ldo 16($inp),$inp
- std $Zhh,0($Xi)
- cmpb,*<> $inp,$len,L\$outer_ghash_pa2
- copy $Zll,$nlo
-___
-
-$code.=<<___ if ($SIZE_T==4);
- b L\$done_ghash
- nop
-
-L\$parisc1_ghash
- ldb 15($Xi),$nlo
- ldo 12($Htbl),$Hll
- ldo 8($Htbl),$Hlh
- ldo 4($Htbl),$Hhl
-
-L\$outer_ghash_pa1
- ldb 15($inp),$byte
- xor $byte,$nlo,$nlo
- and $mask0xf0,$nlo,$nhi
- zdep $nlo,27,4,$nlo
-
- ldwx $nlo($Hll),$Zll
- ldwx $nlo($Hlh),$Zlh
- ldwx $nlo($Hhl),$Zhl
- ldwx $nlo($Hhh),$Zhh
- zdep $Zll,28,4,$rem
- ldb 14($Xi),$nlo
- ldb 14($inp),$byte
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhl,$Zlh,4,$Zlh
- ldwx $nhi($Hlh),$Tlh
- shrpw $Zhh,$Zhl,4,$Zhl
- ldwx $nhi($Hhl),$Thl
- extru $Zhh,27,28,$Zhh
- ldwx $nhi($Hhh),$Thh
- xor $byte,$nlo,$nlo
- xor $rem,$Zhh,$Zhh
- and $mask0xf0,$nlo,$nhi
- zdep $nlo,27,4,$nlo
-
- xor $Tll,$Zll,$Zll
- ldwx $nlo($Hll),$Tll
- xor $Tlh,$Zlh,$Zlh
- ldwx $nlo($Hlh),$Tlh
- xor $Thl,$Zhl,$Zhl
- b L\$oop_ghash_pa1
- ldi 13,$cnt
-
- .ALIGN 8
-L\$oop_ghash_pa1
- zdep $Zll,28,4,$rem
- ldwx $nlo($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nlo($Hhh),$Thh
- shrpw $Zhl,$Zlh,4,$Zlh
- ldbx $cnt($Xi),$nlo
- xor $Tll,$Zll,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhh,$Zhl,4,$Zhl
- ldbx $cnt($inp),$byte
- xor $Tlh,$Zlh,$Zlh
- ldwx $nhi($Hlh),$Tlh
- extru $Zhh,27,28,$Zhh
- xor $Thl,$Zhl,$Zhl
- ldwx $nhi($Hhl),$Thl
- xor $rem,$Zhh,$Zhh
- zdep $Zll,28,4,$rem
- xor $Thh,$Zhh,$Zhh
- ldwx $nhi($Hhh),$Thh
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $rem($rem_4bit),$rem
- shrpw $Zhl,$Zlh,4,$Zlh
- xor $byte,$nlo,$nlo
- shrpw $Zhh,$Zhl,4,$Zhl
- and $mask0xf0,$nlo,$nhi
- extru $Zhh,27,28,$Zhh
- zdep $nlo,27,4,$nlo
- xor $Tll,$Zll,$Zll
- ldwx $nlo($Hll),$Tll
- xor $Tlh,$Zlh,$Zlh
- ldwx $nlo($Hlh),$Tlh
- xor $rem,$Zhh,$Zhh
- addib,uv -1,$cnt,L\$oop_ghash_pa1
- xor $Thl,$Zhl,$Zhl
-
- zdep $Zll,28,4,$rem
- ldwx $nlo($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- ldwx $nlo($Hhh),$Thh
- shrpw $Zhl,$Zlh,4,$Zlh
- xor $Tll,$Zll,$Zll
- ldwx $nhi($Hll),$Tll
- shrpw $Zhh,$Zhl,4,$Zhl
- xor $Tlh,$Zlh,$Zlh
- ldwx $nhi($Hlh),$Tlh
- extru $Zhh,27,28,$Zhh
- xor $rem,$Zhh,$Zhh
- xor $Thl,$Zhl,$Zhl
- ldwx $nhi($Hhl),$Thl
- xor $Thh,$Zhh,$Zhh
- ldwx $nhi($Hhh),$Thh
- zdep $Zll,28,4,$rem
- ldwx $rem($rem_4bit),$rem
- shrpw $Zlh,$Zll,4,$Zll
- shrpw $Zhl,$Zlh,4,$Zlh
- shrpw $Zhh,$Zhl,4,$Zhl
- extru $Zhh,27,28,$Zhh
- xor $Tll,$Zll,$Zll
- xor $Tlh,$Zlh,$Zlh
- xor $rem,$Zhh,$Zhh
- stw $Zll,12($Xi)
- xor $Thl,$Zhl,$Zhl
- stw $Zlh,8($Xi)
- xor $Thh,$Zhh,$Zhh
- stw $Zhl,4($Xi)
- ldo 16($inp),$inp
- stw $Zhh,0($Xi)
- comb,<> $inp,$len,L\$outer_ghash_pa1
- copy $Zll,$nlo
-___
-$code.=<<___;
-L\$done_ghash
- $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
- $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
- $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
- $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
-___
-$code.=<<___ if ($SIZE_T==4);
- $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
- $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
- $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
- $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
- $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
-___
-$code.=<<___;
- bv (%r2)
- .EXIT
- $POPMB -$FRAME(%sp),%r3
- .PROCEND
-
- .ALIGN 64
-L\$rem_4bit
- .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
- .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
- .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
- .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
- .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
- .ALIGN 64
-___
-
-# Explicitly encode PA-RISC 2.0 instructions used in this module, so
-# that it can be compiled with .LEVEL 1.0. It should be noted that I
-# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
-# directive...
-
-my $ldd = sub {
- my ($mod,$args) = @_;
- my $orig = "ldd$mod\t$args";
-
- if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
- { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
- { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
- $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
- $opcode|=(1<<5) if ($mod =~ /^,m/);
- $opcode|=(1<<13) if ($mod =~ /^,mb/);
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- else { "\t".$orig; }
-};
-
-my $std = sub {
- my ($mod,$args) = @_;
- my $orig = "std$mod\t$args";
-
- if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
- { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- else { "\t".$orig; }
-};
-
-my $extrd = sub {
- my ($mod,$args) = @_;
- my $orig = "extrd$mod\t$args";
-
- # I only have ",u" completer, it's implicitly encoded...
- if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
- { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
- my $len=32-$3;
- $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
- $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
- { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
- my $len=32-$2;
- $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
- $opcode |= (1<<13) if ($mod =~ /,\**=/);
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- else { "\t".$orig; }
-};
-
-my $shrpd = sub {
- my ($mod,$args) = @_;
- my $orig = "shrpd$mod\t$args";
-
- if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
- { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
- my $cpos=63-$3;
- $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
- { sprintf "\t.WORD\t0x%08x\t; %s",
- (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
- }
- else { "\t".$orig; }
-};
-
-my $depd = sub {
- my ($mod,$args) = @_;
- my $orig = "depd$mod\t$args";
-
- # I only have ",z" completer, it's impicitly encoded...
- if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
- { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
- my $cpos=63-$2;
- my $len=32-$3;
- $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
- $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
- sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
- }
- else { "\t".$orig; }
-};
-
-sub assemble {
- my ($mnemonic,$mod,$args)=@_;
- my $opcode = eval("\$$mnemonic");
-
- ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
-}
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
- if ($SIZE_T==4) {
- s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
- s/cmpb,\*/comb,/;
- s/,\*/,/;
- }
- s/\bbv\b/bve/ if ($SIZE_T==8);
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/app/openssl/crypto/modes/asm/ghash-s390x.pl b/app/openssl/crypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d8..00000000
--- a/app/openssl/crypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# September 2010.
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Performance
-# was measured to be ~18 cycles per processed byte on z10, which is
-# almost 40% better than gcc-generated code. It should be noted that
-# 18 cycles is worse result than expected: loop is scheduled for 12
-# and the result should be close to 12. In the lack of instruction-
-# level profiling data it's impossible to tell why...
-
-# November 2010.
-#
-# Adapt for -m31 build. If kernel supports what's called "highgprs"
-# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
-# instructions and achieve "64-bit" performance even in 31-bit legacy
-# application context. The feature is not specific to any particular
-# processor, as long as it's "z-CPU". Latter implies that the code
-# remains z/Architecture specific. On z990 it was measured to perform
-# 2.8x better than 32-bit code generated by gcc 4.3.
-
-# March 2011.
-#
-# Support for hardware KIMD-GHASH is verified to produce correct
-# result and therefore is engaged. On z196 it was measured to process
-# 8KB buffer ~7 faster than software implementation. It's not as
-# impressive for smaller buffer sizes and for smallest 16-bytes buffer
-# it's actually almost 2 times slower. Which is the reason why
-# KIMD-GHASH is not used in gcm_gmult_4bit.
-
-$flavour = shift;
-
-if ($flavour =~ /3[12]/) {
- $SIZE_T=4;
- $g="";
-} else {
- $SIZE_T=8;
- $g="g";
-}
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$softonly=0;
-
-$Zhi="%r0";
-$Zlo="%r1";
-
-$Xi="%r2"; # argument block
-$Htbl="%r3";
-$inp="%r4";
-$len="%r5";
-
-$rem0="%r6"; # variables
-$rem1="%r7";
-$nlo="%r8";
-$nhi="%r9";
-$xi="%r10";
-$cnt="%r11";
-$tmp="%r12";
-$x78="%r13";
-$rem_4bit="%r14";
-
-$sp="%r15";
-
-$code.=<<___;
-.text
-
-.globl gcm_gmult_4bit
-.align 32
-gcm_gmult_4bit:
-___
-$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
- larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security-assist
- jz .Lsoft_gmult
- lghi %r0,0
- la %r1,16($sp)
- .long 0xb93e0004 # kimd %r0,%r4
- lg %r1,24($sp)
- tmhh %r1,0x4000 # check for function 65
- jz .Lsoft_gmult
- stg %r0,16($sp) # arrange 16 bytes of zero input
- stg %r0,24($sp)
- lghi %r0,65 # function 65
- la %r1,0($Xi) # H lies right after Xi in gcm128_context
- la $inp,16($sp)
- lghi $len,16
- .long 0xb93e0004 # kimd %r0,$inp
- brc 1,.-4 # pay attention to "partial completion"
- br %r14
-.align 32
-.Lsoft_gmult:
-___
-$code.=<<___;
- stm${g} %r6,%r14,6*$SIZE_T($sp)
-
- aghi $Xi,-1
- lghi $len,1
- lghi $x78,`0xf<<3`
- larl $rem_4bit,rem_4bit
-
- lg $Zlo,8+1($Xi) # Xi
- j .Lgmult_shortcut
-.type gcm_gmult_4bit,\@function
-.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
-
-.globl gcm_ghash_4bit
-.align 32
-gcm_ghash_4bit:
-___
-$code.=<<___ if(!$softonly);
- larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security-assist
- jz .Lsoft_ghash
- lghi %r0,0
- la %r1,16($sp)
- .long 0xb93e0004 # kimd %r0,%r4
- lg %r1,24($sp)
- tmhh %r1,0x4000 # check for function 65
- jz .Lsoft_ghash
- lghi %r0,65 # function 65
- la %r1,0($Xi) # H lies right after Xi in gcm128_context
- .long 0xb93e0004 # kimd %r0,$inp
- brc 1,.-4 # pay attention to "partial completion"
- br %r14
-.align 32
-.Lsoft_ghash:
-___
-$code.=<<___ if ($flavour =~ /3[12]/);
- llgfr $len,$len
-___
-$code.=<<___;
- stm${g} %r6,%r14,6*$SIZE_T($sp)
-
- aghi $Xi,-1
- srlg $len,$len,4
- lghi $x78,`0xf<<3`
- larl $rem_4bit,rem_4bit
-
- lg $Zlo,8+1($Xi) # Xi
- lg $Zhi,0+1($Xi)
- lghi $tmp,0
-.Louter:
- xg $Zhi,0($inp) # Xi ^= inp
- xg $Zlo,8($inp)
- xgr $Zhi,$tmp
- stg $Zlo,8+1($Xi)
- stg $Zhi,0+1($Xi)
-
-.Lgmult_shortcut:
- lghi $tmp,0xf0
- sllg $nlo,$Zlo,4
- srlg $xi,$Zlo,8 # extract second byte
- ngr $nlo,$tmp
- lgr $nhi,$Zlo
- lghi $cnt,14
- ngr $nhi,$tmp
-
- lg $Zlo,8($nlo,$Htbl)
- lg $Zhi,0($nlo,$Htbl)
-
- sllg $nlo,$xi,4
- sllg $rem0,$Zlo,3
- ngr $nlo,$tmp
- ngr $rem0,$x78
- ngr $xi,$tmp
-
- sllg $tmp,$Zhi,60
- srlg $Zlo,$Zlo,4
- srlg $Zhi,$Zhi,4
- xg $Zlo,8($nhi,$Htbl)
- xg $Zhi,0($nhi,$Htbl)
- lgr $nhi,$xi
- sllg $rem1,$Zlo,3
- xgr $Zlo,$tmp
- ngr $rem1,$x78
- j .Lghash_inner
-.align 16
-.Lghash_inner:
- srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
- srlg $Zhi,$Zhi,4
- llgc $xi,0($cnt,$Xi)
- xg $Zhi,0($nlo,$Htbl)
- sllg $nlo,$xi,4
- xg $Zhi,0($rem0,$rem_4bit)
- nill $nlo,0xf0
- sllg $rem0,$Zlo,3
- xgr $Zlo,$tmp
- ngr $rem0,$x78
- nill $xi,0xf0
-
- sllg $tmp,$Zhi,60
- srlg $Zlo,$Zlo,4
- srlg $Zhi,$Zhi,4
- xg $Zlo,8($nhi,$Htbl)
- xg $Zhi,0($nhi,$Htbl)
- lgr $nhi,$xi
- xg $Zhi,0($rem1,$rem_4bit)
- sllg $rem1,$Zlo,3
- xgr $Zlo,$tmp
- ngr $rem1,$x78
- brct $cnt,.Lghash_inner
-
- sllg $tmp,$Zhi,60
- srlg $Zlo,$Zlo,4
- srlg $Zhi,$Zhi,4
- xg $Zlo,8($nlo,$Htbl)
- xg $Zhi,0($nlo,$Htbl)
- sllg $xi,$Zlo,3
- xg $Zhi,0($rem0,$rem_4bit)
- xgr $Zlo,$tmp
- ngr $xi,$x78
-
- sllg $tmp,$Zhi,60
- srlg $Zlo,$Zlo,4
- srlg $Zhi,$Zhi,4
- xg $Zlo,8($nhi,$Htbl)
- xg $Zhi,0($nhi,$Htbl)
- xgr $Zlo,$tmp
- xg $Zhi,0($rem1,$rem_4bit)
-
- lg $tmp,0($xi,$rem_4bit)
- la $inp,16($inp)
- sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
- brctg $len,.Louter
-
- xgr $Zhi,$tmp
- stg $Zlo,8+1($Xi)
- stg $Zhi,0+1($Xi)
- lm${g} %r6,%r14,6*$SIZE_T($sp)
- br %r14
-.type gcm_ghash_4bit,\@function
-.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
-
-.align 64
-rem_4bit:
- .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
- .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
- .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
- .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
-.type rem_4bit,\@object
-.size rem_4bit,(.-rem_4bit)
-.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
-close STDOUT;
diff --git a/app/openssl/crypto/modes/asm/ghash-sparcv9.pl b/app/openssl/crypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index 70e7b044..00000000
--- a/app/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,330 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Performance
-# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
-# and are expressed in cycles per processed byte, less is better:
-#
-# gcc 3.3.x cc 5.2 this assembler
-#
-# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
-# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
-#
-# Here is data collected on UltraSPARC T1 system running Linux:
-#
-# gcc 4.4.1 this assembler
-#
-# 32-bit build 566 50 (+1000%)
-# 64-bit build 56 50 (+12%)
-#
-# I don't quite understand why difference between 32-bit and 64-bit
-# compiler-generated code is so big. Compilers *were* instructed to
-# generate code for UltraSPARC and should have used 64-bit registers
-# for Z vector (see C code) even in 32-bit build... Oh well, it only
-# means more impressive improvement coefficients for this assembler
-# module;-) Loops are aggressively modulo-scheduled in respect to
-# references to input data and Z.hi updates to achieve 12 cycles
-# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
-# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
-
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
-
-$output=shift;
-open STDOUT,">$output";
-
-$Zhi="%o0"; # 64-bit values
-$Zlo="%o1";
-$Thi="%o2";
-$Tlo="%o3";
-$rem="%o4";
-$tmp="%o5";
-
-$nhi="%l0"; # small values and pointers
-$nlo="%l1";
-$xi0="%l2";
-$xi1="%l3";
-$rem_4bit="%l4";
-$remi="%l5";
-$Htblo="%l6";
-$cnt="%l7";
-
-$Xi="%i0"; # input argument block
-$Htbl="%i1";
-$inp="%i2";
-$len="%i3";
-
-$code.=<<___;
-.section ".text",#alloc,#execinstr
-
-.align 64
-rem_4bit:
- .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
- .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
- .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
- .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
-.type rem_4bit,#object
-.size rem_4bit,(.-rem_4bit)
-
-.globl gcm_ghash_4bit
-.align 32
-gcm_ghash_4bit:
- save %sp,-$frame,%sp
- ldub [$inp+15],$nlo
- ldub [$Xi+15],$xi0
- ldub [$Xi+14],$xi1
- add $len,$inp,$len
- add $Htbl,8,$Htblo
-
-1: call .+8
- add %o7,rem_4bit-1b,$rem_4bit
-
-.Louter:
- xor $xi0,$nlo,$nlo
- and $nlo,0xf0,$nhi
- and $nlo,0x0f,$nlo
- sll $nlo,4,$nlo
- ldx [$Htblo+$nlo],$Zlo
- ldx [$Htbl+$nlo],$Zhi
-
- ldub [$inp+14],$nlo
-
- ldx [$Htblo+$nhi],$Tlo
- and $Zlo,0xf,$remi
- ldx [$Htbl+$nhi],$Thi
- sll $remi,3,$remi
- ldx [$rem_4bit+$remi],$rem
- srlx $Zlo,4,$Zlo
- mov 13,$cnt
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
-
- xor $xi1,$nlo,$nlo
- and $Zlo,0xf,$remi
- and $nlo,0xf0,$nhi
- and $nlo,0x0f,$nlo
- ba .Lghash_inner
- sll $nlo,4,$nlo
-.align 32
-.Lghash_inner:
- ldx [$Htblo+$nlo],$Tlo
- sll $remi,3,$remi
- xor $Thi,$Zhi,$Zhi
- ldx [$Htbl+$nlo],$Thi
- srlx $Zlo,4,$Zlo
- xor $rem,$Zhi,$Zhi
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- ldub [$inp+$cnt],$nlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- ldub [$Xi+$cnt],$xi1
- xor $Thi,$Zhi,$Zhi
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nhi],$Tlo
- sll $remi,3,$remi
- xor $rem,$Zhi,$Zhi
- ldx [$Htbl+$nhi],$Thi
- srlx $Zlo,4,$Zlo
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $xi1,$nlo,$nlo
- srlx $Zhi,4,$Zhi
- and $nlo,0xf0,$nhi
- addcc $cnt,-1,$cnt
- xor $Zlo,$tmp,$Zlo
- and $nlo,0x0f,$nlo
- xor $Tlo,$Zlo,$Zlo
- sll $nlo,4,$nlo
- blu .Lghash_inner
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nlo],$Tlo
- sll $remi,3,$remi
- xor $Thi,$Zhi,$Zhi
- ldx [$Htbl+$nlo],$Thi
- srlx $Zlo,4,$Zlo
- xor $rem,$Zhi,$Zhi
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
-
- add $inp,16,$inp
- cmp $inp,$len
- be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nhi],$Tlo
- sll $remi,3,$remi
- xor $rem,$Zhi,$Zhi
- ldx [$Htbl+$nhi],$Thi
- srlx $Zlo,4,$Zlo
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- ldub [$inp+15],$nlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
- stx $Zlo,[$Xi+8]
- xor $rem,$Zhi,$Zhi
- stx $Zhi,[$Xi]
- srl $Zlo,8,$xi1
- and $Zlo,0xff,$xi0
- ba .Louter
- and $xi1,0xff,$xi1
-.align 32
-.Ldone:
- ldx [$Htblo+$nhi],$Tlo
- sll $remi,3,$remi
- xor $rem,$Zhi,$Zhi
- ldx [$Htbl+$nhi],$Thi
- srlx $Zlo,4,$Zlo
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
- stx $Zlo,[$Xi+8]
- xor $rem,$Zhi,$Zhi
- stx $Zhi,[$Xi]
-
- ret
- restore
-.type gcm_ghash_4bit,#function
-.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
-___
-
-undef $inp;
-undef $len;
-
-$code.=<<___;
-.globl gcm_gmult_4bit
-.align 32
-gcm_gmult_4bit:
- save %sp,-$frame,%sp
- ldub [$Xi+15],$nlo
- add $Htbl,8,$Htblo
-
-1: call .+8
- add %o7,rem_4bit-1b,$rem_4bit
-
- and $nlo,0xf0,$nhi
- and $nlo,0x0f,$nlo
- sll $nlo,4,$nlo
- ldx [$Htblo+$nlo],$Zlo
- ldx [$Htbl+$nlo],$Zhi
-
- ldub [$Xi+14],$nlo
-
- ldx [$Htblo+$nhi],$Tlo
- and $Zlo,0xf,$remi
- ldx [$Htbl+$nhi],$Thi
- sll $remi,3,$remi
- ldx [$rem_4bit+$remi],$rem
- srlx $Zlo,4,$Zlo
- mov 13,$cnt
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
-
- and $Zlo,0xf,$remi
- and $nlo,0xf0,$nhi
- and $nlo,0x0f,$nlo
- ba .Lgmult_inner
- sll $nlo,4,$nlo
-.align 32
-.Lgmult_inner:
- ldx [$Htblo+$nlo],$Tlo
- sll $remi,3,$remi
- xor $Thi,$Zhi,$Zhi
- ldx [$Htbl+$nlo],$Thi
- srlx $Zlo,4,$Zlo
- xor $rem,$Zhi,$Zhi
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- ldub [$Xi+$cnt],$nlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nhi],$Tlo
- sll $remi,3,$remi
- xor $rem,$Zhi,$Zhi
- ldx [$Htbl+$nhi],$Thi
- srlx $Zlo,4,$Zlo
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- srlx $Zhi,4,$Zhi
- and $nlo,0xf0,$nhi
- addcc $cnt,-1,$cnt
- xor $Zlo,$tmp,$Zlo
- and $nlo,0x0f,$nlo
- xor $Tlo,$Zlo,$Zlo
- sll $nlo,4,$nlo
- blu .Lgmult_inner
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nlo],$Tlo
- sll $remi,3,$remi
- xor $Thi,$Zhi,$Zhi
- ldx [$Htbl+$nlo],$Thi
- srlx $Zlo,4,$Zlo
- xor $rem,$Zhi,$Zhi
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
- and $Zlo,0xf,$remi
-
- ldx [$Htblo+$nhi],$Tlo
- sll $remi,3,$remi
- xor $rem,$Zhi,$Zhi
- ldx [$Htbl+$nhi],$Thi
- srlx $Zlo,4,$Zlo
- ldx [$rem_4bit+$remi],$rem
- sllx $Zhi,60,$tmp
- xor $Tlo,$Zlo,$Zlo
- srlx $Zhi,4,$Zhi
- xor $Zlo,$tmp,$Zlo
- xor $Thi,$Zhi,$Zhi
- stx $Zlo,[$Xi+8]
- xor $rem,$Zhi,$Zhi
- stx $Zhi,[$Xi]
-
- ret
- restore
-.type gcm_gmult_4bit,#function
-.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
-close STDOUT;
diff --git a/app/openssl/crypto/modes/asm/ghash-x86.S b/app/openssl/crypto/modes/asm/ghash-x86.S
deleted file mode 100644
index 50473201..00000000
--- a/app/openssl/crypto/modes/asm/ghash-x86.S
+++ /dev/null
@@ -1,1269 +0,0 @@
-.file "ghash-x86.s"
-.text
-.globl gcm_gmult_4bit_x86
-.type gcm_gmult_4bit_x86,@function
-.align 16
-gcm_gmult_4bit_x86:
-.L_gcm_gmult_4bit_x86_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- subl $84,%esp
- movl 104(%esp),%edi
- movl 108(%esp),%esi
- movl (%edi),%ebp
- movl 4(%edi),%edx
- movl 8(%edi),%ecx
- movl 12(%edi),%ebx
- movl $0,16(%esp)
- movl $471859200,20(%esp)
- movl $943718400,24(%esp)
- movl $610271232,28(%esp)
- movl $1887436800,32(%esp)
- movl $1822425088,36(%esp)
- movl $1220542464,40(%esp)
- movl $1423966208,44(%esp)
- movl $3774873600,48(%esp)
- movl $4246732800,52(%esp)
- movl $3644850176,56(%esp)
- movl $3311403008,60(%esp)
- movl $2441084928,64(%esp)
- movl $2376073216,68(%esp)
- movl $2847932416,72(%esp)
- movl $3051356160,76(%esp)
- movl %ebp,(%esp)
- movl %edx,4(%esp)
- movl %ecx,8(%esp)
- movl %ebx,12(%esp)
- shrl $20,%ebx
- andl $240,%ebx
- movl 4(%esi,%ebx,1),%ebp
- movl (%esi,%ebx,1),%edx
- movl 12(%esi,%ebx,1),%ecx
- movl 8(%esi,%ebx,1),%ebx
- xorl %eax,%eax
- movl $15,%edi
- jmp .L000x86_loop
-.align 16
-.L000x86_loop:
- movb %bl,%al
- shrdl $4,%ecx,%ebx
- andb $15,%al
- shrdl $4,%edx,%ecx
- shrdl $4,%ebp,%edx
- shrl $4,%ebp
- xorl 16(%esp,%eax,4),%ebp
- movb (%esp,%edi,1),%al
- andb $240,%al
- xorl 8(%esi,%eax,1),%ebx
- xorl 12(%esi,%eax,1),%ecx
- xorl (%esi,%eax,1),%edx
- xorl 4(%esi,%eax,1),%ebp
- decl %edi
- js .L001x86_break
- movb %bl,%al
- shrdl $4,%ecx,%ebx
- andb $15,%al
- shrdl $4,%edx,%ecx
- shrdl $4,%ebp,%edx
- shrl $4,%ebp
- xorl 16(%esp,%eax,4),%ebp
- movb (%esp,%edi,1),%al
- shlb $4,%al
- xorl 8(%esi,%eax,1),%ebx
- xorl 12(%esi,%eax,1),%ecx
- xorl (%esi,%eax,1),%edx
- xorl 4(%esi,%eax,1),%ebp
- jmp .L000x86_loop
-.align 16
-.L001x86_break:
- bswap %ebx
- bswap %ecx
- bswap %edx
- bswap %ebp
- movl 104(%esp),%edi
- movl %ebx,12(%edi)
- movl %ecx,8(%edi)
- movl %edx,4(%edi)
- movl %ebp,(%edi)
- addl $84,%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin
-.globl gcm_ghash_4bit_x86
-.type gcm_ghash_4bit_x86,@function
-.align 16
-gcm_ghash_4bit_x86:
-.L_gcm_ghash_4bit_x86_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- subl $84,%esp
- movl 104(%esp),%ebx
- movl 108(%esp),%esi
- movl 112(%esp),%edi
- movl 116(%esp),%ecx
- addl %edi,%ecx
- movl %ecx,116(%esp)
- movl (%ebx),%ebp
- movl 4(%ebx),%edx
- movl 8(%ebx),%ecx
- movl 12(%ebx),%ebx
- movl $0,16(%esp)
- movl $471859200,20(%esp)
- movl $943718400,24(%esp)
- movl $610271232,28(%esp)
- movl $1887436800,32(%esp)
- movl $1822425088,36(%esp)
- movl $1220542464,40(%esp)
- movl $1423966208,44(%esp)
- movl $3774873600,48(%esp)
- movl $4246732800,52(%esp)
- movl $3644850176,56(%esp)
- movl $3311403008,60(%esp)
- movl $2441084928,64(%esp)
- movl $2376073216,68(%esp)
- movl $2847932416,72(%esp)
- movl $3051356160,76(%esp)
-.align 16
-.L002x86_outer_loop:
- xorl 12(%edi),%ebx
- xorl 8(%edi),%ecx
- xorl 4(%edi),%edx
- xorl (%edi),%ebp
- movl %ebx,12(%esp)
- movl %ecx,8(%esp)
- movl %edx,4(%esp)
- movl %ebp,(%esp)
- shrl $20,%ebx
- andl $240,%ebx
- movl 4(%esi,%ebx,1),%ebp
- movl (%esi,%ebx,1),%edx
- movl 12(%esi,%ebx,1),%ecx
- movl 8(%esi,%ebx,1),%ebx
- xorl %eax,%eax
- movl $15,%edi
- jmp .L003x86_loop
-.align 16
-.L003x86_loop:
- movb %bl,%al
- shrdl $4,%ecx,%ebx
- andb $15,%al
- shrdl $4,%edx,%ecx
- shrdl $4,%ebp,%edx
- shrl $4,%ebp
- xorl 16(%esp,%eax,4),%ebp
- movb (%esp,%edi,1),%al
- andb $240,%al
- xorl 8(%esi,%eax,1),%ebx
- xorl 12(%esi,%eax,1),%ecx
- xorl (%esi,%eax,1),%edx
- xorl 4(%esi,%eax,1),%ebp
- decl %edi
- js .L004x86_break
- movb %bl,%al
- shrdl $4,%ecx,%ebx
- andb $15,%al
- shrdl $4,%edx,%ecx
- shrdl $4,%ebp,%edx
- shrl $4,%ebp
- xorl 16(%esp,%eax,4),%ebp
- movb (%esp,%edi,1),%al
- shlb $4,%al
- xorl 8(%esi,%eax,1),%ebx
- xorl 12(%esi,%eax,1),%ecx
- xorl (%esi,%eax,1),%edx
- xorl 4(%esi,%eax,1),%ebp
- jmp .L003x86_loop
-.align 16
-.L004x86_break:
- bswap %ebx
- bswap %ecx
- bswap %edx
- bswap %ebp
- movl 112(%esp),%edi
- leal 16(%edi),%edi
- cmpl 116(%esp),%edi
- movl %edi,112(%esp)
- jb .L002x86_outer_loop
- movl 104(%esp),%edi
- movl %ebx,12(%edi)
- movl %ecx,8(%edi)
- movl %edx,4(%edi)
- movl %ebp,(%edi)
- addl $84,%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.globl gcm_gmult_4bit_mmx
-.type gcm_gmult_4bit_mmx,@function
-.align 16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- call .L005pic_point
-.L005pic_point:
- popl %eax
- leal .Lrem_4bit-.L005pic_point(%eax),%eax
- movzbl 15(%edi),%ebx
- xorl %ecx,%ecx
- movl %ebx,%edx
- movb %dl,%cl
- movl $14,%ebp
- shlb $4,%cl
- andl $240,%edx
- movq 8(%esi,%ecx,1),%mm0
- movq (%esi,%ecx,1),%mm1
- movd %mm0,%ebx
- jmp .L006mmx_loop
-.align 16
-.L006mmx_loop:
- psrlq $4,%mm0
- andl $15,%ebx
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb (%edi,%ebp,1),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- decl %ebp
- movd %mm0,%ebx
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- pxor %mm2,%mm0
- js .L007mmx_break
- shlb $4,%cl
- andl $15,%ebx
- psrlq $4,%mm0
- andl $240,%edx
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- movd %mm0,%ebx
- pxor (%esi,%ecx,1),%mm1
- pxor %mm2,%mm0
- jmp .L006mmx_loop
-.align 16
-.L007mmx_break:
- shlb $4,%cl
- andl $15,%ebx
- psrlq $4,%mm0
- andl $240,%edx
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- movd %mm0,%ebx
- pxor (%esi,%ecx,1),%mm1
- pxor %mm2,%mm0
- psrlq $4,%mm0
- andl $15,%ebx
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- movd %mm0,%ebx
- pxor (%esi,%edx,1),%mm1
- pxor %mm2,%mm0
- psrlq $32,%mm0
- movd %mm1,%edx
- psrlq $32,%mm1
- movd %mm0,%ecx
- movd %mm1,%ebp
- bswap %ebx
- bswap %edx
- bswap %ecx
- bswap %ebp
- emms
- movl %ebx,12(%edi)
- movl %edx,4(%edi)
- movl %ecx,8(%edi)
- movl %ebp,(%edi)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size gcm_gmult_4bit_mmx,.-.L_gcm_gmult_4bit_mmx_begin
-.globl gcm_ghash_4bit_mmx
-.type gcm_ghash_4bit_mmx,@function
-.align 16
-gcm_ghash_4bit_mmx:
-.L_gcm_ghash_4bit_mmx_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%eax
- movl 24(%esp),%ebx
- movl 28(%esp),%ecx
- movl 32(%esp),%edx
- movl %esp,%ebp
- call .L008pic_point
-.L008pic_point:
- popl %esi
- leal .Lrem_8bit-.L008pic_point(%esi),%esi
- subl $544,%esp
- andl $-64,%esp
- subl $16,%esp
- addl %ecx,%edx
- movl %eax,544(%esp)
- movl %edx,552(%esp)
- movl %ebp,556(%esp)
- addl $128,%ebx
- leal 144(%esp),%edi
- leal 400(%esp),%ebp
- movl -120(%ebx),%edx
- movq -120(%ebx),%mm0
- movq -128(%ebx),%mm3
- shll $4,%edx
- movb %dl,(%esp)
- movl -104(%ebx),%edx
- movq -104(%ebx),%mm2
- movq -112(%ebx),%mm5
- movq %mm0,-128(%edi)
- psrlq $4,%mm0
- movq %mm3,(%edi)
- movq %mm3,%mm7
- psrlq $4,%mm3
- shll $4,%edx
- movb %dl,1(%esp)
- movl -88(%ebx),%edx
- movq -88(%ebx),%mm1
- psllq $60,%mm7
- movq -96(%ebx),%mm4
- por %mm7,%mm0
- movq %mm2,-120(%edi)
- psrlq $4,%mm2
- movq %mm5,8(%edi)
- movq %mm5,%mm6
- movq %mm0,-128(%ebp)
- psrlq $4,%mm5
- movq %mm3,(%ebp)
- shll $4,%edx
- movb %dl,2(%esp)
- movl -72(%ebx),%edx
- movq -72(%ebx),%mm0
- psllq $60,%mm6
- movq -80(%ebx),%mm3
- por %mm6,%mm2
- movq %mm1,-112(%edi)
- psrlq $4,%mm1
- movq %mm4,16(%edi)
- movq %mm4,%mm7
- movq %mm2,-120(%ebp)
- psrlq $4,%mm4
- movq %mm5,8(%ebp)
- shll $4,%edx
- movb %dl,3(%esp)
- movl -56(%ebx),%edx
- movq -56(%ebx),%mm2
- psllq $60,%mm7
- movq -64(%ebx),%mm5
- por %mm7,%mm1
- movq %mm0,-104(%edi)
- psrlq $4,%mm0
- movq %mm3,24(%edi)
- movq %mm3,%mm6
- movq %mm1,-112(%ebp)
- psrlq $4,%mm3
- movq %mm4,16(%ebp)
- shll $4,%edx
- movb %dl,4(%esp)
- movl -40(%ebx),%edx
- movq -40(%ebx),%mm1
- psllq $60,%mm6
- movq -48(%ebx),%mm4
- por %mm6,%mm0
- movq %mm2,-96(%edi)
- psrlq $4,%mm2
- movq %mm5,32(%edi)
- movq %mm5,%mm7
- movq %mm0,-104(%ebp)
- psrlq $4,%mm5
- movq %mm3,24(%ebp)
- shll $4,%edx
- movb %dl,5(%esp)
- movl -24(%ebx),%edx
- movq -24(%ebx),%mm0
- psllq $60,%mm7
- movq -32(%ebx),%mm3
- por %mm7,%mm2
- movq %mm1,-88(%edi)
- psrlq $4,%mm1
- movq %mm4,40(%edi)
- movq %mm4,%mm6
- movq %mm2,-96(%ebp)
- psrlq $4,%mm4
- movq %mm5,32(%ebp)
- shll $4,%edx
- movb %dl,6(%esp)
- movl -8(%ebx),%edx
- movq -8(%ebx),%mm2
- psllq $60,%mm6
- movq -16(%ebx),%mm5
- por %mm6,%mm1
- movq %mm0,-80(%edi)
- psrlq $4,%mm0
- movq %mm3,48(%edi)
- movq %mm3,%mm7
- movq %mm1,-88(%ebp)
- psrlq $4,%mm3
- movq %mm4,40(%ebp)
- shll $4,%edx
- movb %dl,7(%esp)
- movl 8(%ebx),%edx
- movq 8(%ebx),%mm1
- psllq $60,%mm7
- movq (%ebx),%mm4
- por %mm7,%mm0
- movq %mm2,-72(%edi)
- psrlq $4,%mm2
- movq %mm5,56(%edi)
- movq %mm5,%mm6
- movq %mm0,-80(%ebp)
- psrlq $4,%mm5
- movq %mm3,48(%ebp)
- shll $4,%edx
- movb %dl,8(%esp)
- movl 24(%ebx),%edx
- movq 24(%ebx),%mm0
- psllq $60,%mm6
- movq 16(%ebx),%mm3
- por %mm6,%mm2
- movq %mm1,-64(%edi)
- psrlq $4,%mm1
- movq %mm4,64(%edi)
- movq %mm4,%mm7
- movq %mm2,-72(%ebp)
- psrlq $4,%mm4
- movq %mm5,56(%ebp)
- shll $4,%edx
- movb %dl,9(%esp)
- movl 40(%ebx),%edx
- movq 40(%ebx),%mm2
- psllq $60,%mm7
- movq 32(%ebx),%mm5
- por %mm7,%mm1
- movq %mm0,-56(%edi)
- psrlq $4,%mm0
- movq %mm3,72(%edi)
- movq %mm3,%mm6
- movq %mm1,-64(%ebp)
- psrlq $4,%mm3
- movq %mm4,64(%ebp)
- shll $4,%edx
- movb %dl,10(%esp)
- movl 56(%ebx),%edx
- movq 56(%ebx),%mm1
- psllq $60,%mm6
- movq 48(%ebx),%mm4
- por %mm6,%mm0
- movq %mm2,-48(%edi)
- psrlq $4,%mm2
- movq %mm5,80(%edi)
- movq %mm5,%mm7
- movq %mm0,-56(%ebp)
- psrlq $4,%mm5
- movq %mm3,72(%ebp)
- shll $4,%edx
- movb %dl,11(%esp)
- movl 72(%ebx),%edx
- movq 72(%ebx),%mm0
- psllq $60,%mm7
- movq 64(%ebx),%mm3
- por %mm7,%mm2
- movq %mm1,-40(%edi)
- psrlq $4,%mm1
- movq %mm4,88(%edi)
- movq %mm4,%mm6
- movq %mm2,-48(%ebp)
- psrlq $4,%mm4
- movq %mm5,80(%ebp)
- shll $4,%edx
- movb %dl,12(%esp)
- movl 88(%ebx),%edx
- movq 88(%ebx),%mm2
- psllq $60,%mm6
- movq 80(%ebx),%mm5
- por %mm6,%mm1
- movq %mm0,-32(%edi)
- psrlq $4,%mm0
- movq %mm3,96(%edi)
- movq %mm3,%mm7
- movq %mm1,-40(%ebp)
- psrlq $4,%mm3
- movq %mm4,88(%ebp)
- shll $4,%edx
- movb %dl,13(%esp)
- movl 104(%ebx),%edx
- movq 104(%ebx),%mm1
- psllq $60,%mm7
- movq 96(%ebx),%mm4
- por %mm7,%mm0
- movq %mm2,-24(%edi)
- psrlq $4,%mm2
- movq %mm5,104(%edi)
- movq %mm5,%mm6
- movq %mm0,-32(%ebp)
- psrlq $4,%mm5
- movq %mm3,96(%ebp)
- shll $4,%edx
- movb %dl,14(%esp)
- movl 120(%ebx),%edx
- movq 120(%ebx),%mm0
- psllq $60,%mm6
- movq 112(%ebx),%mm3
- por %mm6,%mm2
- movq %mm1,-16(%edi)
- psrlq $4,%mm1
- movq %mm4,112(%edi)
- movq %mm4,%mm7
- movq %mm2,-24(%ebp)
- psrlq $4,%mm4
- movq %mm5,104(%ebp)
- shll $4,%edx
- movb %dl,15(%esp)
- psllq $60,%mm7
- por %mm7,%mm1
- movq %mm0,-8(%edi)
- psrlq $4,%mm0
- movq %mm3,120(%edi)
- movq %mm3,%mm6
- movq %mm1,-16(%ebp)
- psrlq $4,%mm3
- movq %mm4,112(%ebp)
- psllq $60,%mm6
- por %mm6,%mm0
- movq %mm0,-8(%ebp)
- movq %mm3,120(%ebp)
- movq (%eax),%mm6
- movl 8(%eax),%ebx
- movl 12(%eax),%edx
-.align 16
-.L009outer:
- xorl 12(%ecx),%edx
- xorl 8(%ecx),%ebx
- pxor (%ecx),%mm6
- leal 16(%ecx),%ecx
- movl %ebx,536(%esp)
- movq %mm6,528(%esp)
- movl %ecx,548(%esp)
- xorl %eax,%eax
- roll $8,%edx
- movb %dl,%al
- movl %eax,%ebp
- andb $15,%al
- shrl $4,%ebp
- pxor %mm0,%mm0
- roll $8,%edx
- pxor %mm1,%mm1
- pxor %mm2,%mm2
- movq 16(%esp,%eax,8),%mm7
- movq 144(%esp,%eax,8),%mm6
- movb %dl,%al
- movd %mm7,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- shrl $4,%edi
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm2
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movl 536(%esp),%edx
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm2,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm1
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm1,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm0
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm0,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm2
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm2,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm1
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movl 532(%esp),%edx
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm1,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm0
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm0,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm2
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm2,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm1
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm1,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm0
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movl 528(%esp),%edx
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm0,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm2
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm2,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm1
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm1,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm0
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- xorb (%esp,%ebp,1),%bl
- movb %dl,%al
- movd %mm7,%ecx
- movzbl %bl,%ebx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%ebp
- psrlq $8,%mm6
- pxor 272(%esp,%edi,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm0,%mm6
- shrl $4,%ebp
- pinsrw $2,(%esi,%ebx,2),%mm2
- pxor 16(%esp,%eax,8),%mm7
- roll $8,%edx
- pxor 144(%esp,%eax,8),%mm6
- pxor %mm3,%mm7
- pxor 400(%esp,%edi,8),%mm6
- xorb (%esp,%edi,1),%cl
- movb %dl,%al
- movl 524(%esp),%edx
- movd %mm7,%ebx
- movzbl %cl,%ecx
- psrlq $8,%mm7
- movq %mm6,%mm3
- movl %eax,%edi
- psrlq $8,%mm6
- pxor 272(%esp,%ebp,8),%mm7
- andb $15,%al
- psllq $56,%mm3
- pxor %mm2,%mm6
- shrl $4,%edi
- pinsrw $2,(%esi,%ecx,2),%mm1
- pxor 16(%esp,%eax,8),%mm7
- pxor 144(%esp,%eax,8),%mm6
- xorb (%esp,%ebp,1),%bl
- pxor %mm3,%mm7
- pxor 400(%esp,%ebp,8),%mm6
- movzbl %bl,%ebx
- pxor %mm2,%mm2
- psllq $4,%mm1
- movd %mm7,%ecx
- psrlq $4,%mm7
- movq %mm6,%mm3
- psrlq $4,%mm6
- shll $4,%ecx
- pxor 16(%esp,%edi,8),%mm7
- psllq $60,%mm3
- movzbl %cl,%ecx
- pxor %mm3,%mm7
- pxor 144(%esp,%edi,8),%mm6
- pinsrw $2,(%esi,%ebx,2),%mm0
- pxor %mm1,%mm6
- movd %mm7,%edx
- pinsrw $3,(%esi,%ecx,2),%mm2
- psllq $12,%mm0
- pxor %mm0,%mm6
- psrlq $32,%mm7
- pxor %mm2,%mm6
- movl 548(%esp),%ecx
- movd %mm7,%ebx
- movq %mm6,%mm3
- psllw $8,%mm6
- psrlw $8,%mm3
- por %mm3,%mm6
- bswap %edx
- pshufw $27,%mm6,%mm6
- bswap %ebx
- cmpl 552(%esp),%ecx
- jne .L009outer
- movl 544(%esp),%eax
- movl %edx,12(%eax)
- movl %ebx,8(%eax)
- movq %mm6,(%eax)
- movl 556(%esp),%esp
- emms
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
-.globl gcm_init_clmul
-.type gcm_init_clmul,@function
-.align 16
-gcm_init_clmul:
-.L_gcm_init_clmul_begin:
- movl 4(%esp),%edx
- movl 8(%esp),%eax
- call .L010pic
-.L010pic:
- popl %ecx
- leal .Lbswap-.L010pic(%ecx),%ecx
- movdqu (%eax),%xmm2
- pshufd $78,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
- movdqa %xmm2,%xmm3
- psllq $1,%xmm2
- pxor %xmm5,%xmm5
- psrlq $63,%xmm3
- pcmpgtd %xmm4,%xmm5
- pslldq $8,%xmm3
- por %xmm3,%xmm2
- pand 16(%ecx),%xmm5
- pxor %xmm5,%xmm2
- movdqa %xmm2,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- movdqu %xmm2,(%edx)
- movdqu %xmm0,16(%edx)
- ret
-.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
-.globl gcm_gmult_clmul
-.type gcm_gmult_clmul,@function
-.align 16
-gcm_gmult_clmul:
-.L_gcm_gmult_clmul_begin:
- movl 4(%esp),%eax
- movl 8(%esp),%edx
- call .L011pic
-.L011pic:
- popl %ecx
- leal .Lbswap-.L011pic(%ecx),%ecx
- movdqu (%eax),%xmm0
- movdqa (%ecx),%xmm5
- movups (%edx),%xmm2
-.byte 102,15,56,0,197
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,197
- movdqu %xmm0,(%eax)
- ret
-.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
-.globl gcm_ghash_clmul
-.type gcm_ghash_clmul,@function
-.align 16
-gcm_ghash_clmul:
-.L_gcm_ghash_clmul_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%eax
- movl 24(%esp),%edx
- movl 28(%esp),%esi
- movl 32(%esp),%ebx
- call .L012pic
-.L012pic:
- popl %ecx
- leal .Lbswap-.L012pic(%ecx),%ecx
- movdqu (%eax),%xmm0
- movdqa (%ecx),%xmm5
- movdqu (%edx),%xmm2
-.byte 102,15,56,0,197
- subl $16,%ebx
- jz .L013odd_tail
- movdqu (%esi),%xmm3
- movdqu 16(%esi),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
- movups 16(%edx),%xmm2
- leal 32(%esi),%esi
- subl $32,%ebx
- jbe .L014even_tail
-.L015mod_loop:
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqu (%esi),%xmm3
- movups (%edx),%xmm2
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
- movdqu 16(%esi),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- movdqa %xmm6,%xmm5
- movdqa %xmm6,%xmm7
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pshufd $78,%xmm5,%xmm3
- pxor %xmm4,%xmm1
- pxor %xmm5,%xmm3
- pshufd $78,%xmm2,%xmm5
- pxor %xmm2,%xmm5
-.byte 102,15,58,68,250,17
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-.byte 102,15,58,68,221,0
- movups 16(%edx),%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm3
- movdqa %xmm3,%xmm5
- psrldq $8,%xmm3
- pslldq $8,%xmm5
- pxor %xmm3,%xmm7
- pxor %xmm5,%xmm6
- movdqa (%ecx),%xmm5
- leal 32(%esi),%esi
- subl $32,%ebx
- ja .L015mod_loop
-.L014even_tail:
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- testl %ebx,%ebx
- jnz .L016done
- movups (%edx),%xmm2
-.L013odd_tail:
- movdqu (%esi),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-.L016done:
-.byte 102,15,56,0,197
- movdqu %xmm0,(%eax)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
-.align 64
-.Lbswap:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
-.align 64
-.Lrem_4bit:
-.long 0,0,0,471859200,0,943718400,0,610271232
-.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
-.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
-.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
-.align 64
-.Lrem_8bit:
-.value 0,450,900,582,1800,1738,1164,1358
-.value 3600,4050,3476,3158,2328,2266,2716,2910
-.value 7200,7650,8100,7782,6952,6890,6316,6510
-.value 4656,5106,4532,4214,5432,5370,5820,6014
-.value 14400,14722,15300,14854,16200,16010,15564,15630
-.value 13904,14226,13780,13334,12632,12442,13020,13086
-.value 9312,9634,10212,9766,9064,8874,8428,8494
-.value 10864,11186,10740,10294,11640,11450,12028,12094
-.value 28800,28994,29444,29382,30600,30282,29708,30158
-.value 32400,32594,32020,31958,31128,30810,31260,31710
-.value 27808,28002,28452,28390,27560,27242,26668,27118
-.value 25264,25458,24884,24822,26040,25722,26172,26622
-.value 18624,18690,19268,19078,20424,19978,19532,19854
-.value 18128,18194,17748,17558,16856,16410,16988,17310
-.value 21728,21794,22372,22182,21480,21034,20588,20910
-.value 23280,23346,22900,22710,24056,23610,24188,24510
-.value 57600,57538,57988,58182,58888,59338,58764,58446
-.value 61200,61138,60564,60758,59416,59866,60316,59998
-.value 64800,64738,65188,65382,64040,64490,63916,63598
-.value 62256,62194,61620,61814,62520,62970,63420,63102
-.value 55616,55426,56004,56070,56904,57226,56780,56334
-.value 55120,54930,54484,54550,53336,53658,54236,53790
-.value 50528,50338,50916,50982,49768,50090,49644,49198
-.value 52080,51890,51444,51510,52344,52666,53244,52798
-.value 37248,36930,37380,37830,38536,38730,38156,38094
-.value 40848,40530,39956,40406,39064,39258,39708,39646
-.value 36256,35938,36388,36838,35496,35690,35116,35054
-.value 33712,33394,32820,33270,33976,34170,34620,34558
-.value 43456,43010,43588,43910,44744,44810,44364,44174
-.value 42960,42514,42068,42390,41176,41242,41820,41630
-.value 46560,46114,46692,47014,45800,45866,45420,45230
-.value 48112,47666,47220,47542,48376,48442,49020,48830
-.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
-.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
-.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
-.byte 0
diff --git a/app/openssl/crypto/modes/asm/ghash-x86.pl b/app/openssl/crypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 2426cd0c..00000000
--- a/app/openssl/crypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1342 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March, May, June 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
-# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
-# of per-key storage [+512 bytes shared table]. Performance results
-# are for streamed GHASH subroutine and are expressed in cycles per
-# processed byte, less is better:
-#
-# gcc 2.95.3(*) MMX assembler x86 assembler
-#
-# Pentium 105/111(**) - 50
-# PIII 68 /75 12.2 24
-# P4 125/125 17.8 84(***)
-# Opteron 66 /70 10.1 30
-# Core2 54 /67 8.4 18
-#
-# (*) gcc 3.4.x was observed to generate few percent slower code,
-# which is one of reasons why 2.95.3 results were chosen,
-# another reason is lack of 3.4.x results for older CPUs;
-# comparison with MMX results is not completely fair, because C
-# results are for vanilla "256B" implementation, while
-# assembler results are for "528B";-)
-# (**) second number is result for code compiled with -fPIC flag,
-# which is actually more relevant, because assembler code is
-# position-independent;
-# (***) see comment in non-MMX routine for further details;
-#
-# To summarize, it's >2-5 times faster than gcc-generated code. To
-# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
-
-# May 2010
-#
-# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
-# The question is how close is it to theoretical limit? The pclmulqdq
-# instruction latency appears to be 14 cycles and there can't be more
-# than 2 of them executing at any given time. This means that single
-# Karatsuba multiplication would take 28 cycles *plus* few cycles for
-# pre- and post-processing. Then multiplication has to be followed by
-# modulo-reduction. Given that aggregated reduction method [see
-# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
-# white paper by Intel] allows you to perform reduction only once in
-# a while we can assume that asymptotic performance can be estimated
-# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
-# and Naggr is the aggregation factor.
-#
-# Before we proceed to this implementation let's have closer look at
-# the best-performing code suggested by Intel in their white paper.
-# By tracing inter-register dependencies Tmod is estimated as ~19
-# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
-# processed byte. As implied, this is quite optimistic estimate,
-# because it does not account for Karatsuba pre- and post-processing,
-# which for a single multiplication is ~5 cycles. Unfortunately Intel
-# does not provide performance data for GHASH alone. But benchmarking
-# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
-# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
-# the result accounts even for pre-computing of degrees of the hash
-# key H, but its portion is negligible at 16KB buffer size.
-#
-# Moving on to the implementation in question. Tmod is estimated as
-# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
-# 2.16. How is it possible that measured performance is better than
-# optimistic theoretical estimate? There is one thing Intel failed
-# to recognize. By serializing GHASH with CTR in same subroutine
-# former's performance is really limited to above (Tmul + Tmod/Naggr)
-# equation. But if GHASH procedure is detached, the modulo-reduction
-# can be interleaved with Naggr-1 multiplications at instruction level
-# and under ideal conditions even disappear from the equation. So that
-# optimistic theoretical estimate for this implementation is ...
-# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
-# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
-# where Tproc is time required for Karatsuba pre- and post-processing,
-# is more realistic estimate. In this case it gives ... 1.91 cycles.
-# Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
-# 1.91 and 2.16. As already mentioned, this implementation processes
-# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
-# - in 2.02. x86_64 performance is better, because larger register
-# bank allows to interleave reduction and multiplication better.
-#
-# Does it make sense to increase Naggr? To start with it's virtually
-# impossible in 32-bit mode, because of limited register bank
-# capacity. Otherwise improvement has to be weighed agiainst slower
-# setup, as well as code size and complexity increase. As even
-# optimistic estimate doesn't promise 30% performance improvement,
-# there are currently no plans to increase Naggr.
-#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
-
-# January 2010
-#
-# Tweaked to optimize transitions between integer and FP operations
-# on same XMM register, PCLMULQDQ subroutine was measured to process
-# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
-# The minor regression on Westmere is outweighed by ~15% improvement
-# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
-# similar manner resulted in almost 20% degradation on Sandy Bridge,
-# where original 64-bit code processes one byte in 1.95 cycles.
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
-
-$sse2=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
-$inp = "edi";
-$Htbl = "esi";
-
-$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
- # than unrolled, which has to be weighted against
- # 2.5x x86-specific code size reduction.
-
-sub x86_loop {
- my $off = shift;
- my $rem = "eax";
-
- &mov ($Zhh,&DWP(4,$Htbl,$Zll));
- &mov ($Zhl,&DWP(0,$Htbl,$Zll));
- &mov ($Zlh,&DWP(12,$Htbl,$Zll));
- &mov ($Zll,&DWP(8,$Htbl,$Zll));
- &xor ($rem,$rem); # avoid partial register stalls on PIII
-
- # shrd practically kills P4, 2.5x deterioration, but P4 has
- # MMX code-path to execute. shrd runs tad faster [than twice
- # the shifts, move's and or's] on pre-MMX Pentium (as well as
- # PIII and Core2), *but* minimizes code size, spares register
- # and thus allows to fold the loop...
- if (!$unroll) {
- my $cnt = $inp;
- &mov ($cnt,15);
- &jmp (&label("x86_loop"));
- &set_label("x86_loop",16);
- for($i=1;$i<=2;$i++) {
- &mov (&LB($rem),&LB($Zll));
- &shrd ($Zll,$Zlh,4);
- &and (&LB($rem),0xf);
- &shrd ($Zlh,$Zhl,4);
- &shrd ($Zhl,$Zhh,4);
- &shr ($Zhh,4);
- &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
-
- &mov (&LB($rem),&BP($off,"esp",$cnt));
- if ($i&1) {
- &and (&LB($rem),0xf0);
- } else {
- &shl (&LB($rem),4);
- }
-
- &xor ($Zll,&DWP(8,$Htbl,$rem));
- &xor ($Zlh,&DWP(12,$Htbl,$rem));
- &xor ($Zhl,&DWP(0,$Htbl,$rem));
- &xor ($Zhh,&DWP(4,$Htbl,$rem));
-
- if ($i&1) {
- &dec ($cnt);
- &js (&label("x86_break"));
- } else {
- &jmp (&label("x86_loop"));
- }
- }
- &set_label("x86_break",16);
- } else {
- for($i=1;$i<32;$i++) {
- &comment($i);
- &mov (&LB($rem),&LB($Zll));
- &shrd ($Zll,$Zlh,4);
- &and (&LB($rem),0xf);
- &shrd ($Zlh,$Zhl,4);
- &shrd ($Zhl,$Zhh,4);
- &shr ($Zhh,4);
- &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
-
- if ($i&1) {
- &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
- &and (&LB($rem),0xf0);
- } else {
- &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
- &shl (&LB($rem),4);
- }
-
- &xor ($Zll,&DWP(8,$Htbl,$rem));
- &xor ($Zlh,&DWP(12,$Htbl,$rem));
- &xor ($Zhl,&DWP(0,$Htbl,$rem));
- &xor ($Zhh,&DWP(4,$Htbl,$rem));
- }
- }
- &bswap ($Zll);
- &bswap ($Zlh);
- &bswap ($Zhl);
- if (!$x86only) {
- &bswap ($Zhh);
- } else {
- &mov ("eax",$Zhh);
- &bswap ("eax");
- &mov ($Zhh,"eax");
- }
-}
-
-if ($unroll) {
- &function_begin_B("_x86_gmult_4bit_inner");
- &x86_loop(4);
- &ret ();
- &function_end_B("_x86_gmult_4bit_inner");
-}
-
-sub deposit_rem_4bit {
- my $bias = shift;
-
- &mov (&DWP($bias+0, "esp"),0x0000<<16);
- &mov (&DWP($bias+4, "esp"),0x1C20<<16);
- &mov (&DWP($bias+8, "esp"),0x3840<<16);
- &mov (&DWP($bias+12,"esp"),0x2460<<16);
- &mov (&DWP($bias+16,"esp"),0x7080<<16);
- &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
- &mov (&DWP($bias+24,"esp"),0x48C0<<16);
- &mov (&DWP($bias+28,"esp"),0x54E0<<16);
- &mov (&DWP($bias+32,"esp"),0xE100<<16);
- &mov (&DWP($bias+36,"esp"),0xFD20<<16);
- &mov (&DWP($bias+40,"esp"),0xD940<<16);
- &mov (&DWP($bias+44,"esp"),0xC560<<16);
- &mov (&DWP($bias+48,"esp"),0x9180<<16);
- &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
- &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
- &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
-}
-
-$suffix = $x86only ? "" : "_x86";
-
-&function_begin("gcm_gmult_4bit".$suffix);
- &stack_push(16+4+1); # +1 for stack alignment
- &mov ($inp,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
-
- &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
- &mov ($Zhl,&DWP(4,$inp));
- &mov ($Zlh,&DWP(8,$inp));
- &mov ($Zll,&DWP(12,$inp));
-
- &deposit_rem_4bit(16);
-
- &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
- &mov (&DWP(4,"esp"),$Zhl);
- &mov (&DWP(8,"esp"),$Zlh);
- &mov (&DWP(12,"esp"),$Zll);
- &shr ($Zll,20);
- &and ($Zll,0xf0);
-
- if ($unroll) {
- &call ("_x86_gmult_4bit_inner");
- } else {
- &x86_loop(0);
- &mov ($inp,&wparam(0));
- }
-
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(0,$inp),$Zhh);
- &stack_pop(16+4+1);
-&function_end("gcm_gmult_4bit".$suffix);
-
-&function_begin("gcm_ghash_4bit".$suffix);
- &stack_push(16+4+1); # +1 for 64-bit alignment
- &mov ($Zll,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
- &mov ($inp,&wparam(2)); # load in
- &mov ("ecx",&wparam(3)); # load len
- &add ("ecx",$inp);
- &mov (&wparam(3),"ecx");
-
- &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
- &mov ($Zhl,&DWP(4,$Zll));
- &mov ($Zlh,&DWP(8,$Zll));
- &mov ($Zll,&DWP(12,$Zll));
-
- &deposit_rem_4bit(16);
-
- &set_label("x86_outer_loop",16);
- &xor ($Zll,&DWP(12,$inp)); # xor with input
- &xor ($Zlh,&DWP(8,$inp));
- &xor ($Zhl,&DWP(4,$inp));
- &xor ($Zhh,&DWP(0,$inp));
- &mov (&DWP(12,"esp"),$Zll); # dump it on stack
- &mov (&DWP(8,"esp"),$Zlh);
- &mov (&DWP(4,"esp"),$Zhl);
- &mov (&DWP(0,"esp"),$Zhh);
-
- &shr ($Zll,20);
- &and ($Zll,0xf0);
-
- if ($unroll) {
- &call ("_x86_gmult_4bit_inner");
- } else {
- &x86_loop(0);
- &mov ($inp,&wparam(2));
- }
- &lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(3));
- &mov (&wparam(2),$inp) if (!$unroll);
- &jb (&label("x86_outer_loop"));
-
- &mov ($inp,&wparam(0)); # load Xi
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(0,$inp),$Zhh);
- &stack_pop(16+4+1);
-&function_end("gcm_ghash_4bit".$suffix);
-
-if (!$x86only) {{{
-
-&static_label("rem_4bit");
-
-if (!$sse2) {{ # pure-MMX "May" version...
-
-$S=12; # shift factor for rem_4bit
-
-&function_begin_B("_mmx_gmult_4bit_inner");
-# MMX version performs 3.5 times better on P4 (see comment in non-MMX
-# routine for further details), 100% better on Opteron, ~70% better
-# on Core2 and PIII... In other words effort is considered to be well
-# spent... Since initial release the loop was unrolled in order to
-# "liberate" register previously used as loop counter. Instead it's
-# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
-# The path involves move of Z.lo from MMX to integer register,
-# effective address calculation and finally merge of value to Z.hi.
-# Reference to rem_4bit is scheduled so late that I had to >>4
-# rem_4bit elements. This resulted in 20-45% procent improvement
-# on contemporary µ-archs.
-{
- my $cnt;
- my $rem_4bit = "eax";
- my @rem = ($Zhh,$Zll);
- my $nhi = $Zhl;
- my $nlo = $Zlh;
-
- my ($Zlo,$Zhi) = ("mm0","mm1");
- my $tmp = "mm2";
-
- &xor ($nlo,$nlo); # avoid partial register stalls on PIII
- &mov ($nhi,$Zll);
- &mov (&LB($nlo),&LB($nhi));
- &shl (&LB($nlo),4);
- &and ($nhi,0xf0);
- &movq ($Zlo,&QWP(8,$Htbl,$nlo));
- &movq ($Zhi,&QWP(0,$Htbl,$nlo));
- &movd ($rem[0],$Zlo);
-
- for ($cnt=28;$cnt>=-2;$cnt--) {
- my $odd = $cnt&1;
- my $nix = $odd ? $nlo : $nhi;
-
- &shl (&LB($nlo),4) if ($odd);
- &psrlq ($Zlo,4);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nix));
- &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
- &psllq ($tmp,60);
- &and ($nhi,0xf0) if ($odd);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
- &and ($rem[0],0xf);
- &pxor ($Zhi,&QWP(0,$Htbl,$nix));
- &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
- &movd ($rem[1],$Zlo);
- &pxor ($Zlo,$tmp);
-
- push (@rem,shift(@rem)); # "rotate" registers
- }
-
- &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
-
- &psrlq ($Zlo,32); # lower part of Zlo is already there
- &movd ($Zhl,$Zhi);
- &psrlq ($Zhi,32);
- &movd ($Zlh,$Zlo);
- &movd ($Zhh,$Zhi);
- &shl ($inp,4); # compensate for rem_4bit[i] being >>4
-
- &bswap ($Zll);
- &bswap ($Zhl);
- &bswap ($Zlh);
- &xor ($Zhh,$inp);
- &bswap ($Zhh);
-
- &ret ();
-}
-&function_end_B("_mmx_gmult_4bit_inner");
-
-&function_begin("gcm_gmult_4bit_mmx");
- &mov ($inp,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
-
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("eax");
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
- &movz ($Zll,&BP(15,$inp));
-
- &call ("_mmx_gmult_4bit_inner");
-
- &mov ($inp,&wparam(0)); # load Xi
- &emms ();
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-# Streamed version performs 20% better on P4, 7% on Opteron,
-# 10% on Core2 and PIII...
-&function_begin("gcm_ghash_4bit_mmx");
- &mov ($Zhh,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
- &mov ($inp,&wparam(2)); # load in
- &mov ($Zlh,&wparam(3)); # load len
-
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("eax");
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
- &add ($Zlh,$inp);
- &mov (&wparam(3),$Zlh); # len to point at the end of input
- &stack_push(4+1); # +1 for stack alignment
-
- &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
- &mov ($Zhl,&DWP(4,$Zhh));
- &mov ($Zlh,&DWP(8,$Zhh));
- &mov ($Zhh,&DWP(0,$Zhh));
- &jmp (&label("mmx_outer_loop"));
-
- &set_label("mmx_outer_loop",16);
- &xor ($Zll,&DWP(12,$inp));
- &xor ($Zhl,&DWP(4,$inp));
- &xor ($Zlh,&DWP(8,$inp));
- &xor ($Zhh,&DWP(0,$inp));
- &mov (&wparam(2),$inp);
- &mov (&DWP(12,"esp"),$Zll);
- &mov (&DWP(4,"esp"),$Zhl);
- &mov (&DWP(8,"esp"),$Zlh);
- &mov (&DWP(0,"esp"),$Zhh);
-
- &mov ($inp,"esp");
- &shr ($Zll,24);
-
- &call ("_mmx_gmult_4bit_inner");
-
- &mov ($inp,&wparam(2));
- &lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(3));
- &jb (&label("mmx_outer_loop"));
-
- &mov ($inp,&wparam(0)); # load Xi
- &emms ();
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(0,$inp),$Zhh);
-
- &stack_pop(4+1);
-&function_end("gcm_ghash_4bit_mmx");
-
-}} else {{ # "June" MMX version...
- # ... has slower "April" gcm_gmult_4bit_mmx with folded
- # loop. This is done to conserve code size...
-$S=16; # shift factor for rem_4bit
-
-sub mmx_loop() {
-# MMX version performs 2.8 times better on P4 (see comment in non-MMX
-# routine for further details), 40% better on Opteron and Core2, 50%
-# better on PIII... In other words effort is considered to be well
-# spent...
- my $inp = shift;
- my $rem_4bit = shift;
- my $cnt = $Zhh;
- my $nhi = $Zhl;
- my $nlo = $Zlh;
- my $rem = $Zll;
-
- my ($Zlo,$Zhi) = ("mm0","mm1");
- my $tmp = "mm2";
-
- &xor ($nlo,$nlo); # avoid partial register stalls on PIII
- &mov ($nhi,$Zll);
- &mov (&LB($nlo),&LB($nhi));
- &mov ($cnt,14);
- &shl (&LB($nlo),4);
- &and ($nhi,0xf0);
- &movq ($Zlo,&QWP(8,$Htbl,$nlo));
- &movq ($Zhi,&QWP(0,$Htbl,$nlo));
- &movd ($rem,$Zlo);
- &jmp (&label("mmx_loop"));
-
- &set_label("mmx_loop",16);
- &psrlq ($Zlo,4);
- &and ($rem,0xf);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
- &mov (&LB($nlo),&BP(0,$inp,$cnt));
- &psllq ($tmp,60);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
- &dec ($cnt);
- &movd ($rem,$Zlo);
- &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
- &mov ($nhi,$nlo);
- &pxor ($Zlo,$tmp);
- &js (&label("mmx_break"));
-
- &shl (&LB($nlo),4);
- &and ($rem,0xf);
- &psrlq ($Zlo,4);
- &and ($nhi,0xf0);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
- &psllq ($tmp,60);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
- &movd ($rem,$Zlo);
- &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
- &pxor ($Zlo,$tmp);
- &jmp (&label("mmx_loop"));
-
- &set_label("mmx_break",16);
- &shl (&LB($nlo),4);
- &and ($rem,0xf);
- &psrlq ($Zlo,4);
- &and ($nhi,0xf0);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
- &psllq ($tmp,60);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
- &movd ($rem,$Zlo);
- &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
- &pxor ($Zlo,$tmp);
-
- &psrlq ($Zlo,4);
- &and ($rem,0xf);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
- &psllq ($tmp,60);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
- &movd ($rem,$Zlo);
- &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
- &pxor ($Zlo,$tmp);
-
- &psrlq ($Zlo,32); # lower part of Zlo is already there
- &movd ($Zhl,$Zhi);
- &psrlq ($Zhi,32);
- &movd ($Zlh,$Zlo);
- &movd ($Zhh,$Zhi);
-
- &bswap ($Zll);
- &bswap ($Zhl);
- &bswap ($Zlh);
- &bswap ($Zhh);
-}
-
-&function_begin("gcm_gmult_4bit_mmx");
- &mov ($inp,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
-
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("eax");
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
- &movz ($Zll,&BP(15,$inp));
-
- &mmx_loop($inp,"eax");
-
- &emms ();
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-######################################################################
-# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
-# (see gcm128.c for details). It provides further 20-40% performance
-# improvement over above mentioned "May" version.
-
-&static_label("rem_8bit");
-
-&function_begin("gcm_ghash_4bit_mmx");
-{ my ($Zlo,$Zhi) = ("mm7","mm6");
- my $rem_8bit = "esi";
- my $Htbl = "ebx";
-
- # parameter block
- &mov ("eax",&wparam(0)); # Xi
- &mov ("ebx",&wparam(1)); # Htable
- &mov ("ecx",&wparam(2)); # inp
- &mov ("edx",&wparam(3)); # len
- &mov ("ebp","esp"); # original %esp
- &call (&label("pic_point"));
- &set_label ("pic_point");
- &blindpop ($rem_8bit);
- &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
-
- &sub ("esp",512+16+16); # allocate stack frame...
- &and ("esp",-64); # ...and align it
- &sub ("esp",16); # place for (u8)(H[]<<4)
-
- &add ("edx","ecx"); # pointer to the end of input
- &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
- &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
- &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
-
- { my @lo = ("mm0","mm1","mm2");
- my @hi = ("mm3","mm4","mm5");
- my @tmp = ("mm6","mm7");
- my ($off1,$off2,$i) = (0,0,);
-
- &add ($Htbl,128); # optimize for size
- &lea ("edi",&DWP(16+128,"esp"));
- &lea ("ebp",&DWP(16+256+128,"esp"));
-
- # decompose Htable (low and high parts are kept separately),
- # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
- for ($i=0;$i<18;$i++) {
-
- &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
- &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
- &psllq ($tmp[1],60) if ($i>1);
- &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
- &por ($lo[2],$tmp[1]) if ($i>1);
- &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
- &psrlq ($lo[1],4) if ($i>0 && $i<17);
- &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
- &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
- &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
- &psrlq ($hi[1],4) if ($i>0 && $i<17);
- &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
- &shl ("edx",4) if ($i<16);
- &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
-
- unshift (@lo,pop(@lo)); # "rotate" registers
- unshift (@hi,pop(@hi));
- unshift (@tmp,pop(@tmp));
- $off1 += 8 if ($i>0);
- $off2 += 8 if ($i>1);
- }
- }
-
- &movq ($Zhi,&QWP(0,"eax"));
- &mov ("ebx",&DWP(8,"eax"));
- &mov ("edx",&DWP(12,"eax")); # load Xi
-
-&set_label("outer",16);
- { my $nlo = "eax";
- my $dat = "edx";
- my @nhi = ("edi","ebp");
- my @rem = ("ebx","ecx");
- my @red = ("mm0","mm1","mm2");
- my $tmp = "mm3";
-
- &xor ($dat,&DWP(12,"ecx")); # merge input data
- &xor ("ebx",&DWP(8,"ecx"));
- &pxor ($Zhi,&QWP(0,"ecx"));
- &lea ("ecx",&DWP(16,"ecx")); # inp+=16
- #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
- &mov (&DWP(528+8,"esp"),"ebx");
- &movq (&QWP(528+0,"esp"),$Zhi);
- &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
-
- &xor ($nlo,$nlo);
- &rol ($dat,8);
- &mov (&LB($nlo),&LB($dat));
- &mov ($nhi[1],$nlo);
- &and (&LB($nlo),0x0f);
- &shr ($nhi[1],4);
- &pxor ($red[0],$red[0]);
- &rol ($dat,8); # next byte
- &pxor ($red[1],$red[1]);
- &pxor ($red[2],$red[2]);
-
- # Just like in "May" verson modulo-schedule for critical path in
- # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
- # is scheduled so late that rem_8bit[] has to be shifted *right*
- # by 16, which is why last argument to pinsrw is 2, which
- # corresponds to <<32=<<48>>16...
- for ($j=11,$i=0;$i<15;$i++) {
-
- if ($i>0) {
- &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
- &rol ($dat,8); # next byte
- &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
-
- &pxor ($Zlo,$tmp);
- &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
- &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
- } else {
- &movq ($Zlo,&QWP(16,"esp",$nlo,8));
- &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
- }
-
- &mov (&LB($nlo),&LB($dat));
- &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
-
- &movd ($rem[0],$Zlo);
- &movz ($rem[1],&LB($rem[1])) if ($i>0);
- &psrlq ($Zlo,8); # Z>>=8
-
- &movq ($tmp,$Zhi);
- &mov ($nhi[0],$nlo);
- &psrlq ($Zhi,8);
-
- &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
- &and (&LB($nlo),0x0f);
- &psllq ($tmp,56);
-
- &pxor ($Zhi,$red[1]) if ($i>1);
- &shr ($nhi[0],4);
- &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
-
- unshift (@red,pop(@red)); # "rotate" registers
- unshift (@rem,pop(@rem));
- unshift (@nhi,pop(@nhi));
- }
-
- &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
- &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
- &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
-
- &pxor ($Zlo,$tmp);
- &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
- &movz ($rem[1],&LB($rem[1]));
-
- &pxor ($red[2],$red[2]); # clear 2nd word
- &psllq ($red[1],4);
-
- &movd ($rem[0],$Zlo);
- &psrlq ($Zlo,4); # Z>>=4
-
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &shl ($rem[0],4); # rem<<4
-
- &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
- &psllq ($tmp,60);
- &movz ($rem[0],&LB($rem[0]));
-
- &pxor ($Zlo,$tmp);
- &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
-
- &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
- &pxor ($Zhi,$red[1]);
-
- &movd ($dat,$Zlo);
- &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
-
- &psllq ($red[0],12); # correct by <<16>>4
- &pxor ($Zhi,$red[0]);
- &psrlq ($Zlo,32);
- &pxor ($Zhi,$red[2]);
-
- &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
- &movd ("ebx",$Zlo);
- &movq ($tmp,$Zhi); # 01234567
- &psllw ($Zhi,8); # 1.3.5.7.
- &psrlw ($tmp,8); # .0.2.4.6
- &por ($Zhi,$tmp); # 10325476
- &bswap ($dat);
- &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
- &bswap ("ebx");
-
- &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
- &jne (&label("outer"));
- }
-
- &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
- &mov (&DWP(12,"eax"),"edx");
- &mov (&DWP(8,"eax"),"ebx");
- &movq (&QWP(0,"eax"),$Zhi);
-
- &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
- &emms ();
-}
-&function_end("gcm_ghash_4bit_mmx");
-}}
-
-if ($sse2) {{
-######################################################################
-# PCLMULQDQ version.
-
-$Xip="eax";
-$Htbl="edx";
-$const="ecx";
-$inp="esi";
-$len="ebx";
-
-($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
-($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
-($Xn,$Xhn)=("xmm6","xmm7");
-
-&static_label("bswap");
-
-sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
-
- &movdqa ($Xhi,$Xi); #
- &pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
- &pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
-
- &pclmulqdq ($Xi,$Hkey,0x00); #######
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
- &xorps ($T1,$Xi); #
- &xorps ($T1,$Xhi); #
-
- &movdqa ($T2,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T2,8); #
- &pxor ($Xhi,$T1);
- &pxor ($Xi,$T2); #
-}
-
-sub clmul64x64_T3 {
-# Even though this subroutine offers visually better ILP, it
-# was empirically found to be a tad slower than above version.
-# At least in gcm_ghash_clmul context. But it's just as well,
-# because loop modulo-scheduling is possible only thanks to
-# minimized "register" pressure...
-my ($Xhi,$Xi,$Hkey)=@_;
-
- &movdqa ($T1,$Xi); #
- &movdqa ($Xhi,$Xi);
- &pclmulqdq ($Xi,$Hkey,0x00); #######
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pshufd ($T2,$T1,0b01001110); #
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T2,$T1); #
- &pxor ($T3,$Hkey);
- &pclmulqdq ($T2,$T3,0x00); #######
- &pxor ($T2,$Xi); #
- &pxor ($T2,$Xhi); #
-
- &movdqa ($T3,$T2); #
- &psrldq ($T2,8);
- &pslldq ($T3,8); #
- &pxor ($Xhi,$T2);
- &pxor ($Xi,$T3); #
-}
-
-if (1) { # Algorithm 9 with <<1 twist.
- # Reduction is shorter and uses only two
- # temporary registers, which makes it better
- # candidate for interleaving with 64x64
- # multiplication. Pre-modulo-scheduled loop
- # was found to be ~20% faster than Algorithm 5
- # below. Algorithm 9 was therefore chosen for
- # further optimization...
-
-sub reduction_alg9 { # 17/13 times faster than Intel version
-my ($Xhi,$Xi) = @_;
-
- # 1st phase
- &movdqa ($T1,$Xi); #
- &psllq ($Xi,1);
- &pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
- &psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
- &pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
-
- # 2nd phase
- &movdqa ($T2,$Xi);
- &psrlq ($Xi,5);
- &pxor ($Xi,$T2); #
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-}
-
-&function_begin_B("gcm_init_clmul");
- &mov ($Htbl,&wparam(0));
- &mov ($Xip,&wparam(1));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Hkey,&QWP(0,$Xip));
- &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
-
- # <<1 twist
- &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
- &movdqa ($T1,$Hkey);
- &psllq ($Hkey,1);
- &pxor ($T3,$T3); #
- &psrlq ($T1,63);
- &pcmpgtd ($T3,$T2); # broadcast carry bit
- &pslldq ($T1,8);
- &por ($Hkey,$T1); # H<<=1
-
- # magic reduction
- &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
- &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
-
- # calculate H^2
- &movdqa ($Xi,$Hkey);
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
-
- &movdqu (&QWP(0,$Htbl),$Hkey); # save H
- &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
-
- &ret ();
-&function_end_B("gcm_init_clmul");
-
-&function_begin_B("gcm_gmult_clmul");
- &mov ($Xip,&wparam(0));
- &mov ($Htbl,&wparam(1));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Xi,&QWP(0,$Xip));
- &movdqa ($T3,&QWP(0,$const));
- &movups ($Hkey,&QWP(0,$Htbl));
- &pshufb ($Xi,$T3);
-
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
-
- &pshufb ($Xi,$T3);
- &movdqu (&QWP(0,$Xip),$Xi);
-
- &ret ();
-&function_end_B("gcm_gmult_clmul");
-
-&function_begin("gcm_ghash_clmul");
- &mov ($Xip,&wparam(0));
- &mov ($Htbl,&wparam(1));
- &mov ($inp,&wparam(2));
- &mov ($len,&wparam(3));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Xi,&QWP(0,$Xip));
- &movdqa ($T3,&QWP(0,$const));
- &movdqu ($Hkey,&QWP(0,$Htbl));
- &pshufb ($Xi,$T3);
-
- &sub ($len,0x10);
- &jz (&label("odd_tail"));
-
- #######
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
- # [(H*Ii+1) + (H*Xi+1)] mod P =
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
- #
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
- &pxor ($Xi,$T1); # Ii+Xi
-
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
-
- &lea ($inp,&DWP(32,$inp)); # i+=2
- &sub ($len,0x20);
- &jbe (&label("even_tail"));
-
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
-
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
-
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
-
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
-
- &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
- &psllq ($Xi,1);
- &pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
- &pclmulqdq ($Xn,$Hkey,0x00); #######
- &psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
- &pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
- &pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
- &pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movdqa ($T2,$Xi); # 2nd phase
- &psrlq ($Xi,5);
- &pxor ($Xi,$T2); #
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
- &pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
-
- &lea ($inp,&DWP(32,$inp));
- &sub ($len,0x20);
- &ja (&label("mod_loop"));
-
-&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
-
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
-
- &reduction_alg9 ($Xhi,$Xi);
-
- &test ($len,$len);
- &jnz (&label("done"));
-
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
-&set_label("odd_tail");
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &pshufb ($T1,$T3);
- &pxor ($Xi,$T1); # Ii+Xi
-
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
- &reduction_alg9 ($Xhi,$Xi);
-
-&set_label("done");
- &pshufb ($Xi,$T3);
- &movdqu (&QWP(0,$Xip),$Xi);
-&function_end("gcm_ghash_clmul");
-
-} else { # Algorith 5. Kept for reference purposes.
-
-sub reduction_alg5 { # 19/16 times faster than Intel version
-my ($Xhi,$Xi)=@_;
-
- # <<1
- &movdqa ($T1,$Xi); #
- &movdqa ($T2,$Xhi);
- &pslld ($Xi,1);
- &pslld ($Xhi,1); #
- &psrld ($T1,31);
- &psrld ($T2,31); #
- &movdqa ($T3,$T1);
- &pslldq ($T1,4);
- &psrldq ($T3,12); #
- &pslldq ($T2,4);
- &por ($Xhi,$T3); #
- &por ($Xi,$T1);
- &por ($Xhi,$T2); #
-
- # 1st phase
- &movdqa ($T1,$Xi);
- &movdqa ($T2,$Xi);
- &movdqa ($T3,$Xi); #
- &pslld ($T1,31);
- &pslld ($T2,30);
- &pslld ($Xi,25); #
- &pxor ($T1,$T2);
- &pxor ($T1,$Xi); #
- &movdqa ($T2,$T1); #
- &pslldq ($T1,12);
- &psrldq ($T2,4); #
- &pxor ($T3,$T1);
-
- # 2nd phase
- &pxor ($Xhi,$T3); #
- &movdqa ($Xi,$T3);
- &movdqa ($T1,$T3);
- &psrld ($Xi,1); #
- &psrld ($T1,2);
- &psrld ($T3,7); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2);
- &pxor ($Xi,$T3); #
- &pxor ($Xi,$Xhi); #
-}
-
-&function_begin_B("gcm_init_clmul");
- &mov ($Htbl,&wparam(0));
- &mov ($Xip,&wparam(1));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Hkey,&QWP(0,$Xip));
- &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
-
- # calculate H^2
- &movdqa ($Xi,$Hkey);
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
- &reduction_alg5 ($Xhi,$Xi);
-
- &movdqu (&QWP(0,$Htbl),$Hkey); # save H
- &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
-
- &ret ();
-&function_end_B("gcm_init_clmul");
-
-&function_begin_B("gcm_gmult_clmul");
- &mov ($Xip,&wparam(0));
- &mov ($Htbl,&wparam(1));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Xi,&QWP(0,$Xip));
- &movdqa ($Xn,&QWP(0,$const));
- &movdqu ($Hkey,&QWP(0,$Htbl));
- &pshufb ($Xi,$Xn);
-
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
- &reduction_alg5 ($Xhi,$Xi);
-
- &pshufb ($Xi,$Xn);
- &movdqu (&QWP(0,$Xip),$Xi);
-
- &ret ();
-&function_end_B("gcm_gmult_clmul");
-
-&function_begin("gcm_ghash_clmul");
- &mov ($Xip,&wparam(0));
- &mov ($Htbl,&wparam(1));
- &mov ($inp,&wparam(2));
- &mov ($len,&wparam(3));
-
- &call (&label("pic"));
-&set_label("pic");
- &blindpop ($const);
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
-
- &movdqu ($Xi,&QWP(0,$Xip));
- &movdqa ($T3,&QWP(0,$const));
- &movdqu ($Hkey,&QWP(0,$Htbl));
- &pshufb ($Xi,$T3);
-
- &sub ($len,0x10);
- &jz (&label("odd_tail"));
-
- #######
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
- # [(H*Ii+1) + (H*Xi+1)] mod P =
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
- #
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
- &pxor ($Xi,$T1); # Ii+Xi
-
- &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
- &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
-
- &sub ($len,0x20);
- &lea ($inp,&DWP(32,$inp)); # i+=2
- &jbe (&label("even_tail"));
-
-&set_label("mod_loop");
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
-
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
-
- &reduction_alg5 ($Xhi,$Xi);
-
- #######
- &movdqa ($T3,&QWP(0,$const));
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
- &pxor ($Xi,$T1); # Ii+Xi
-
- &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
- &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
-
- &sub ($len,0x20);
- &lea ($inp,&DWP(32,$inp));
- &ja (&label("mod_loop"));
-
-&set_label("even_tail");
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
-
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
-
- &reduction_alg5 ($Xhi,$Xi);
-
- &movdqa ($T3,&QWP(0,$const));
- &test ($len,$len);
- &jnz (&label("done"));
-
- &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
-&set_label("odd_tail");
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &pshufb ($T1,$T3);
- &pxor ($Xi,$T1); # Ii+Xi
-
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
- &reduction_alg5 ($Xhi,$Xi);
-
- &movdqa ($T3,&QWP(0,$const));
-&set_label("done");
- &pshufb ($Xi,$T3);
- &movdqu (&QWP(0,$Xip),$Xi);
-&function_end("gcm_ghash_clmul");
-
-}
-
-&set_label("bswap",64);
- &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
- &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}} # $sse2
-
-&set_label("rem_4bit",64);
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
-&set_label("rem_8bit",64);
- &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
- &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
- &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
- &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
- &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
- &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
- &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
- &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
- &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
- &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
- &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
- &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
- &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
- &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
- &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
- &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
- &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
- &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
- &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
- &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
- &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
- &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
- &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
- &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
- &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
- &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
- &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
- &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
- &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
- &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
- &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
- &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
-}}} # !$x86only
-
-&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
-&asm_finish();
-
-# A question was risen about choice of vanilla MMX. Or rather why wasn't
-# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
-# CPUs such as PIII, "4-bit" MMX version was observed to provide better
-# performance than *corresponding* SSE2 one even on contemporary CPUs.
-# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
-# implementation featuring full range of lookup-table sizes, but with
-# per-invocation lookup table setup. Latter means that table size is
-# chosen depending on how much data is to be hashed in every given call,
-# more data - larger table. Best reported result for Core2 is ~4 cycles
-# per processed byte out of 64KB block. This number accounts even for
-# 64KB table setup overhead. As discussed in gcm128.c we choose to be
-# more conservative in respect to lookup table sizes, but how do the
-# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
-# on same platform. As also discussed in gcm128.c, next in line "8-bit
-# Shoup's" or "4KB" method should deliver twice the performance of
-# "256B" one, in other words not worse than ~6 cycles per byte. It
-# should be also be noted that in SSE2 case improvement can be "super-
-# linear," i.e. more than twice, mostly because >>8 maps to single
-# instruction on SSE2 register. This is unlike "4-bit" case when >>4
-# maps to same amount of instructions in both MMX and SSE2 cases.
-# Bottom line is that switch to SSE2 is considered to be justifiable
-# only in case we choose to implement "8-bit" method...
diff --git a/app/openssl/crypto/modes/asm/ghash-x86_64.S b/app/openssl/crypto/modes/asm/ghash-x86_64.S
deleted file mode 100644
index 62d39c65..00000000
--- a/app/openssl/crypto/modes/asm/ghash-x86_64.S
+++ /dev/null
@@ -1,1026 +0,0 @@
-.text
-
-.globl gcm_gmult_4bit
-.type gcm_gmult_4bit,@function
-.align 16
-gcm_gmult_4bit:
- pushq %rbx
- pushq %rbp
- pushq %r12
-.Lgmult_prologue:
-
- movzbq 15(%rdi),%r8
- leaq .Lrem_4bit(%rip),%r11
- xorq %rax,%rax
- xorq %rbx,%rbx
- movb %r8b,%al
- movb %r8b,%bl
- shlb $4,%al
- movq $14,%rcx
- movq 8(%rsi,%rax,1),%r8
- movq (%rsi,%rax,1),%r9
- andb $240,%bl
- movq %r8,%rdx
- jmp .Loop1
-
-.align 16
-.Loop1:
- shrq $4,%r8
- andq $15,%rdx
- movq %r9,%r10
- movb (%rdi,%rcx,1),%al
- shrq $4,%r9
- xorq 8(%rsi,%rbx,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rbx,1),%r9
- movb %al,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- shlb $4,%al
- xorq %r10,%r8
- decq %rcx
- js .Lbreak1
-
- shrq $4,%r8
- andq $15,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rax,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rax,1),%r9
- andb $240,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- xorq %r10,%r8
- jmp .Loop1
-
-.align 16
-.Lbreak1:
- shrq $4,%r8
- andq $15,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rax,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rax,1),%r9
- andb $240,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- xorq %r10,%r8
-
- shrq $4,%r8
- andq $15,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rbx,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rbx,1),%r9
- xorq %r10,%r8
- xorq (%r11,%rdx,8),%r9
-
- bswapq %r8
- bswapq %r9
- movq %r8,8(%rdi)
- movq %r9,(%rdi)
-
- movq 16(%rsp),%rbx
- leaq 24(%rsp),%rsp
-.Lgmult_epilogue:
- .byte 0xf3,0xc3
-.size gcm_gmult_4bit,.-gcm_gmult_4bit
-.globl gcm_ghash_4bit
-.type gcm_ghash_4bit,@function
-.align 16
-gcm_ghash_4bit:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $280,%rsp
-.Lghash_prologue:
- movq %rdx,%r14
- movq %rcx,%r15
- subq $-128,%rsi
- leaq 16+128(%rsp),%rbp
- xorl %edx,%edx
- movq 0+0-128(%rsi),%r8
- movq 0+8-128(%rsi),%rax
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq 16+0-128(%rsi),%r9
- shlb $4,%dl
- movq 16+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,0(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,0(%rbp)
- movq 32+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,0-128(%rbp)
- movq 32+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,1(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,8(%rbp)
- movq 48+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,8-128(%rbp)
- movq 48+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,2(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,16(%rbp)
- movq 64+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,16-128(%rbp)
- movq 64+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,3(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,24(%rbp)
- movq 80+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,24-128(%rbp)
- movq 80+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,4(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,32(%rbp)
- movq 96+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,32-128(%rbp)
- movq 96+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,5(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,40(%rbp)
- movq 112+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,40-128(%rbp)
- movq 112+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,6(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,48(%rbp)
- movq 128+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,48-128(%rbp)
- movq 128+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,7(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,56(%rbp)
- movq 144+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,56-128(%rbp)
- movq 144+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,8(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,64(%rbp)
- movq 160+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,64-128(%rbp)
- movq 160+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,9(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,72(%rbp)
- movq 176+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,72-128(%rbp)
- movq 176+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,10(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,80(%rbp)
- movq 192+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,80-128(%rbp)
- movq 192+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,11(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,88(%rbp)
- movq 208+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,88-128(%rbp)
- movq 208+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,12(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,96(%rbp)
- movq 224+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,96-128(%rbp)
- movq 224+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,13(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,104(%rbp)
- movq 240+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,104-128(%rbp)
- movq 240+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,14(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,112(%rbp)
- shlb $4,%dl
- movq %rax,112-128(%rbp)
- shlq $60,%r10
- movb %dl,15(%rsp)
- orq %r10,%rbx
- movq %r9,120(%rbp)
- movq %rbx,120-128(%rbp)
- addq $-128,%rsi
- movq 8(%rdi),%r8
- movq 0(%rdi),%r9
- addq %r14,%r15
- leaq .Lrem_8bit(%rip),%r11
- jmp .Louter_loop
-.align 16
-.Louter_loop:
- xorq (%r14),%r9
- movq 8(%r14),%rdx
- leaq 16(%r14),%r14
- xorq %r8,%rdx
- movq %r9,(%rdi)
- movq %rdx,8(%rdi)
- shrq $32,%rdx
- xorq %rax,%rax
- roll $8,%edx
- movb %dl,%al
- movzbl %dl,%ebx
- shlb $4,%al
- shrl $4,%ebx
- roll $8,%edx
- movq 8(%rsi,%rax,1),%r8
- movq (%rsi,%rax,1),%r9
- movb %dl,%al
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- xorq %r8,%r12
- movq %r9,%r10
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 8(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 4(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 0(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- andl $240,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl -4(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- movzwq (%r11,%r12,2),%r12
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- shlq $48,%r12
- xorq %r10,%r8
- xorq %r12,%r9
- movzbq %r8b,%r13
- shrq $4,%r8
- movq %r9,%r10
- shlb $4,%r13b
- shrq $4,%r9
- xorq 8(%rsi,%rcx,1),%r8
- movzwq (%r11,%r13,2),%r13
- shlq $60,%r10
- xorq (%rsi,%rcx,1),%r9
- xorq %r10,%r8
- shlq $48,%r13
- bswapq %r8
- xorq %r13,%r9
- bswapq %r9
- cmpq %r15,%r14
- jb .Louter_loop
- movq %r8,8(%rdi)
- movq %r9,(%rdi)
-
- leaq 280(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lghash_epilogue:
- .byte 0xf3,0xc3
-.size gcm_ghash_4bit,.-gcm_ghash_4bit
-.globl gcm_init_clmul
-.type gcm_init_clmul,@function
-.align 16
-gcm_init_clmul:
- movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
-
-
- pshufd $255,%xmm2,%xmm4
- movdqa %xmm2,%xmm3
- psllq $1,%xmm2
- pxor %xmm5,%xmm5
- psrlq $63,%xmm3
- pcmpgtd %xmm4,%xmm5
- pslldq $8,%xmm3
- por %xmm3,%xmm2
-
-
- pand .L0x1c2_polynomial(%rip),%xmm5
- pxor %xmm5,%xmm2
-
-
- movdqa %xmm2,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
- .byte 0xf3,0xc3
-.size gcm_init_clmul,.-gcm_init_clmul
-.globl gcm_gmult_clmul
-.type gcm_gmult_clmul,@function
-.align 16
-gcm_gmult_clmul:
- movdqu (%rdi),%xmm0
- movdqa .Lbswap_mask(%rip),%xmm5
- movdqu (%rsi),%xmm2
-.byte 102,15,56,0,197
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,197
- movdqu %xmm0,(%rdi)
- .byte 0xf3,0xc3
-.size gcm_gmult_clmul,.-gcm_gmult_clmul
-.globl gcm_ghash_clmul
-.type gcm_ghash_clmul,@function
-.align 16
-gcm_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm5
-
- movdqu (%rdi),%xmm0
- movdqu (%rsi),%xmm2
-.byte 102,15,56,0,197
-
- subq $16,%rcx
- jz .Lodd_tail
-
- movdqu 16(%rsi),%xmm8
-
-
-
-
-
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe .Leven_tail
-
-.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
-
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
- pxor %xmm3,%xmm1
-
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-.byte 102,15,58,68,250,17
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
-
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- ja .Lmod_loop
-
-.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- testq %rcx,%rcx
- jnz .Ldone
-
-.Lodd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $5,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm4
- pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-.Ldone:
-.byte 102,15,56,0,197
- movdqu %xmm0,(%rdi)
- .byte 0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
-.size gcm_ghash_clmul,.-gcm_ghash_clmul
-.align 64
-.Lbswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.L0x1c2_polynomial:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-.align 64
-.type .Lrem_4bit,@object
-.Lrem_4bit:
-.long 0,0,0,471859200,0,943718400,0,610271232
-.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
-.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
-.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
-.type .Lrem_8bit,@object
-.Lrem_8bit:
-.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
-.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
-.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
-.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
-.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
-.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
-.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
-.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
-.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
-.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
-.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
-.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
-.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
-.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
-.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
-.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
-.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
-.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
-.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
-.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
-.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
-.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
-.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
-.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
-.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
-.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
-.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
-.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
-.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
-.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
-.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
-.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
-
-.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 64
diff --git a/app/openssl/crypto/modes/asm/ghash-x86_64.pl b/app/openssl/crypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index 38d779ed..00000000
--- a/app/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,806 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March, June 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that
-# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
-# function features so called "528B" variant utilizing additional
-# 256+16 bytes of per-key storage [+512 bytes shared table].
-# Performance results are for this streamed GHASH subroutine and are
-# expressed in cycles per processed byte, less is better:
-#
-# gcc 3.4.x(*) assembler
-#
-# P4 28.6 14.0 +100%
-# Opteron 19.3 7.7 +150%
-# Core2 17.8 8.1(**) +120%
-#
-# (*) comparison is not completely fair, because C results are
-# for vanilla "256B" implementation, while assembler results
-# are for "528B";-)
-# (**) it's mystery [to me] why Core2 result is not same as for
-# Opteron;
-
-# May 2010
-#
-# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
-# See ghash-x86.pl for background information and details about coding
-# techniques.
-#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-# common register layout
-$nlo="%rax";
-$nhi="%rbx";
-$Zlo="%r8";
-$Zhi="%r9";
-$tmp="%r10";
-$rem_4bit = "%r11";
-
-$Xi="%rdi";
-$Htbl="%rsi";
-
-# per-function register layout
-$cnt="%rcx";
-$rem="%rdx";
-
-sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
- $r =~ s/%[er]([sd]i)/%\1l/ or
- $r =~ s/%[er](bp)/%\1l/ or
- $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
-
-sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- my $arg = pop;
- $arg = "\$$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
-}
-
-{ my $N;
- sub loop() {
- my $inp = shift;
-
- $N++;
-$code.=<<___;
- xor $nlo,$nlo
- xor $nhi,$nhi
- mov `&LB("$Zlo")`,`&LB("$nlo")`
- mov `&LB("$Zlo")`,`&LB("$nhi")`
- shl \$4,`&LB("$nlo")`
- mov \$14,$cnt
- mov 8($Htbl,$nlo),$Zlo
- mov ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- mov $Zlo,$rem
- jmp .Loop$N
-
-.align 16
-.Loop$N:
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- mov ($inp,$cnt),`&LB("$nlo")`
- shr \$4,$Zhi
- xor 8($Htbl,$nhi),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nhi),$Zhi
- mov `&LB("$nlo")`,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- shl \$4,`&LB("$nlo")`
- xor $tmp,$Zlo
- dec $cnt
- js .Lbreak$N
-
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nlo),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- xor $tmp,$Zlo
- jmp .Loop$N
-
-.align 16
-.Lbreak$N:
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nlo),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- xor $tmp,$Zlo
-
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nhi),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nhi),$Zhi
- xor $tmp,$Zlo
- xor ($rem_4bit,$rem,8),$Zhi
-
- bswap $Zlo
- bswap $Zhi
-___
-}}
-
-$code=<<___;
-.text
-
-.globl gcm_gmult_4bit
-.type gcm_gmult_4bit,\@function,2
-.align 16
-gcm_gmult_4bit:
- push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
- push %r12 # order to reuse Win64 exception handler...
-.Lgmult_prologue:
-
- movzb 15($Xi),$Zlo
- lea .Lrem_4bit(%rip),$rem_4bit
-___
- &loop ($Xi);
-$code.=<<___;
- mov $Zlo,8($Xi)
- mov $Zhi,($Xi)
-
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
-.Lgmult_epilogue:
- ret
-.size gcm_gmult_4bit,.-gcm_gmult_4bit
-___
-
-# per-function register layout
-$inp="%rdx";
-$len="%rcx";
-$rem_8bit=$rem_4bit;
-
-$code.=<<___;
-.globl gcm_ghash_4bit
-.type gcm_ghash_4bit,\@function,4
-.align 16
-gcm_ghash_4bit:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- sub \$280,%rsp
-.Lghash_prologue:
- mov $inp,%r14 # reassign couple of args
- mov $len,%r15
-___
-{ my $inp="%r14";
- my $dat="%edx";
- my $len="%r15";
- my @nhi=("%ebx","%ecx");
- my @rem=("%r12","%r13");
- my $Hshr4="%rbp";
-
- &sub ($Htbl,-128); # size optimization
- &lea ($Hshr4,"16+128(%rsp)");
- { my @lo =($nlo,$nhi);
- my @hi =($Zlo,$Zhi);
-
- &xor ($dat,$dat);
- for ($i=0,$j=-2;$i<18;$i++,$j++) {
- &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
- &or ($lo[0],$tmp) if ($i>1);
- &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
- &shr ($lo[1],4) if ($i>0 && $i<17);
- &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
- &shr ($hi[1],4) if ($i>0 && $i<17);
- &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
- &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
- &shl (&LB($dat),4) if ($i>0 && $i<17);
- &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
- &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
- &shl ($tmp,60) if ($i>0 && $i<17);
-
- push (@lo,shift(@lo));
- push (@hi,shift(@hi));
- }
- }
- &add ($Htbl,-128);
- &mov ($Zlo,"8($Xi)");
- &mov ($Zhi,"0($Xi)");
- &add ($len,$inp); # pointer to the end of data
- &lea ($rem_8bit,".Lrem_8bit(%rip)");
- &jmp (".Louter_loop");
-
-$code.=".align 16\n.Louter_loop:\n";
- &xor ($Zhi,"($inp)");
- &mov ("%rdx","8($inp)");
- &lea ($inp,"16($inp)");
- &xor ("%rdx",$Zlo);
- &mov ("($Xi)",$Zhi);
- &mov ("8($Xi)","%rdx");
- &shr ("%rdx",32);
-
- &xor ($nlo,$nlo);
- &rol ($dat,8);
- &mov (&LB($nlo),&LB($dat));
- &movz ($nhi[0],&LB($dat));
- &shl (&LB($nlo),4);
- &shr ($nhi[0],4);
-
- for ($j=11,$i=0;$i<15;$i++) {
- &rol ($dat,8);
- &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
- &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
- &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
- &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
-
- &mov (&LB($nlo),&LB($dat));
- &xor ($Zlo,$tmp) if ($i>0);
- &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
-
- &movz ($nhi[1],&LB($dat));
- &shl (&LB($nlo),4);
- &movzb ($rem[0],"(%rsp,$nhi[0])");
-
- &shr ($nhi[1],4) if ($i<14);
- &and ($nhi[1],0xf0) if ($i==14);
- &shl ($rem[1],48) if ($i>0);
- &xor ($rem[0],$Zlo);
-
- &mov ($tmp,$Zhi);
- &xor ($Zhi,$rem[1]) if ($i>0);
- &shr ($Zlo,8);
-
- &movz ($rem[0],&LB($rem[0]));
- &mov ($dat,"$j($Xi)") if (--$j%4==0);
- &shr ($Zhi,8);
-
- &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
- &shl ($tmp,56);
- &xor ($Zhi,"($Hshr4,$nhi[0],8)");
-
- unshift (@nhi,pop(@nhi)); # "rotate" registers
- unshift (@rem,pop(@rem));
- }
- &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
- &xor ($Zlo,"8($Htbl,$nlo)");
- &xor ($Zhi,"($Htbl,$nlo)");
-
- &shl ($rem[1],48);
- &xor ($Zlo,$tmp);
-
- &xor ($Zhi,$rem[1]);
- &movz ($rem[0],&LB($Zlo));
- &shr ($Zlo,4);
-
- &mov ($tmp,$Zhi);
- &shl (&LB($rem[0]),4);
- &shr ($Zhi,4);
-
- &xor ($Zlo,"8($Htbl,$nhi[0])");
- &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
- &shl ($tmp,60);
-
- &xor ($Zhi,"($Htbl,$nhi[0])");
- &xor ($Zlo,$tmp);
- &shl ($rem[0],48);
-
- &bswap ($Zlo);
- &xor ($Zhi,$rem[0]);
-
- &bswap ($Zhi);
- &cmp ($inp,$len);
- &jb (".Louter_loop");
-}
-$code.=<<___;
- mov $Zlo,8($Xi)
- mov $Zhi,($Xi)
-
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
-.Lghash_epilogue:
- ret
-.size gcm_ghash_4bit,.-gcm_ghash_4bit
-___
-
-######################################################################
-# PCLMULQDQ version.
-
-@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-
-($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
-($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
-
-sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
-
-$code.=<<___ if (!defined($modulo));
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey,$T2
- pxor $Xi,$T1 #
- pxor $Hkey,$T2
-___
-$code.=<<___;
- pclmulqdq \$0x00,$Hkey,$Xi #######
- pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
- pxor $Xi,$T1 #
- pxor $Xhi,$T1 #
-
- movdqa $T1,$T2 #
- psrldq \$8,$T1
- pslldq \$8,$T2 #
- pxor $T1,$Xhi
- pxor $T2,$Xi #
-___
-}
-
-sub reduction_alg9 { # 17/13 times faster than Intel version
-my ($Xhi,$Xi) = @_;
-
-$code.=<<___;
- # 1st phase
- movdqa $Xi,$T1 #
- psllq \$1,$Xi
- pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- psllq \$57,$Xi #
- movdqa $Xi,$T2 #
- pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
-
- # 2nd phase
- movdqa $Xi,$T2
- psrlq \$5,$Xi
- pxor $T2,$Xi #
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
-___
-}
-
-{ my ($Htbl,$Xip)=@_4args;
-
-$code.=<<___;
-.globl gcm_init_clmul
-.type gcm_init_clmul,\@abi-omnipotent
-.align 16
-gcm_init_clmul:
- movdqu ($Xip),$Hkey
- pshufd \$0b01001110,$Hkey,$Hkey # dword swap
-
- # <<1 twist
- pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
- movdqa $Hkey,$T1
- psllq \$1,$Hkey
- pxor $T3,$T3 #
- psrlq \$63,$T1
- pcmpgtd $T2,$T3 # broadcast carry bit
- pslldq \$8,$T1
- por $T1,$Hkey # H<<=1
-
- # magic reduction
- pand .L0x1c2_polynomial(%rip),$T3
- pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
-
- # calculate H^2
- movdqa $Hkey,$Xi
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
-$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
- ret
-.size gcm_init_clmul,.-gcm_init_clmul
-___
-}
-
-{ my ($Xip,$Htbl)=@_4args;
-
-$code.=<<___;
-.globl gcm_gmult_clmul
-.type gcm_gmult_clmul,\@abi-omnipotent
-.align 16
-gcm_gmult_clmul:
- movdqu ($Xip),$Xi
- movdqa .Lbswap_mask(%rip),$T3
- movdqu ($Htbl),$Hkey
- pshufb $T3,$Xi
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
-$code.=<<___;
- pshufb $T3,$Xi
- movdqu $Xi,($Xip)
- ret
-.size gcm_gmult_clmul,.-gcm_gmult_clmul
-___
-}
-
-{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
-
-$code.=<<___;
-.globl gcm_ghash_clmul
-.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
-gcm_ghash_clmul:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_gcm_ghash_clmul:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
-___
-$code.=<<___;
- movdqa .Lbswap_mask(%rip),$T3
-
- movdqu ($Xip),$Xi
- movdqu ($Htbl),$Hkey
- pshufb $T3,$Xi
-
- sub \$0x10,$len
- jz .Lodd_tail
-
- movdqu 16($Htbl),$Hkey2
- #######
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
- # [(H*Ii+1) + (H*Xi+1)] mod P =
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
- #
- movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
- pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- lea 32($inp),$inp # i+=2
- sub \$0x20,$len
- jbe .Leven_tail
-
-.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
-
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
-
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
-
- movdqa $Xi,$T1 # 1st phase
- psllq \$1,$Xi
- pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
- psllq \$57,$Xi #
- movdqa $Xi,$T2 #
- pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
-
- pclmulqdq \$0x11,$Hkey,$Xhn #######
- movdqa $Xi,$T2 # 2nd phase
- psrlq \$5,$Xi
- pxor $T2,$Xi #
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
-
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
-
- lea 32($inp),$inp
- sub \$0x20,$len
- ja .Lmod_loop
-
-.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
-___
- &reduction_alg9 ($Xhi,$Xi);
-$code.=<<___;
- test $len,$len
- jnz .Ldone
-
-.Lodd_tail:
- movdqu ($inp),$T1 # Ii
- pshufb $T3,$T1
- pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
- &reduction_alg9 ($Xhi,$Xi);
-$code.=<<___;
-.Ldone:
- pshufb $T3,$Xi
- movdqu $Xi,($Xip)
-___
-$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
-___
-$code.=<<___;
- ret
-.LSEH_end_gcm_ghash_clmul:
-.size gcm_ghash_clmul,.-gcm_ghash_clmul
-___
-}
-
-$code.=<<___;
-.align 64
-.Lbswap_mask:
- .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.L0x1c2_polynomial:
- .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-.align 64
-.type .Lrem_4bit,\@object
-.Lrem_4bit:
- .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
- .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
- .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
- .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
-.type .Lrem_8bit,\@object
-.Lrem_8bit:
- .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
- .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
- .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
- .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
- .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
- .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
- .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
- .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
- .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
- .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
- .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
- .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
- .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
- .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
- .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
- .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
- .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
- .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
- .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
- .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
- .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
- .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
- .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
- .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
- .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
- .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
- .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
- .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
- .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
- .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
- .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
- .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
-
-.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 64
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lin_prologue
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lin_prologue
-
- lea 24(%rax),%rax # adjust "rsp"
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
-
-.Lin_prologue:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$`1232/8`,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size se_handler,.-se_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_gcm_gmult_4bit
- .rva .LSEH_end_gcm_gmult_4bit
- .rva .LSEH_info_gcm_gmult_4bit
-
- .rva .LSEH_begin_gcm_ghash_4bit
- .rva .LSEH_end_gcm_ghash_4bit
- .rva .LSEH_info_gcm_ghash_4bit
-
- .rva .LSEH_begin_gcm_ghash_clmul
- .rva .LSEH_end_gcm_ghash_clmul
- .rva .LSEH_info_gcm_ghash_clmul
-
-.section .xdata
-.align 8
-.LSEH_info_gcm_gmult_4bit:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
-.LSEH_info_gcm_ghash_4bit:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
-.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT;
diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx-64.S b/app/openssl/crypto/modes/asm/ghashv8-armx-64.S
deleted file mode 100644
index b77b6c40..00000000
--- a/app/openssl/crypto/modes/asm/ghashv8-armx-64.S
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "arm_arch.h"
-
-.text
-.arch armv8-a+crypto
-.global gcm_init_v8
-.type gcm_init_v8,%function
-.align 4
-gcm_init_v8:
- ld1 {v17.2d},[x1] //load H
- movi v16.16b,#0xe1
- ext v3.16b,v17.16b,v17.16b,#8
- shl v16.2d,v16.2d,#57
- ushr v18.2d,v16.2d,#63
- ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01
- dup v17.4s,v17.s[1]
- ushr v19.2d,v3.2d,#63
- sshr v17.4s,v17.4s,#31 //broadcast carry bit
- and v19.16b,v19.16b,v16.16b
- shl v3.2d,v3.2d,#1
- ext v19.16b,v19.16b,v19.16b,#8
- and v16.16b,v16.16b,v17.16b
- orr v3.16b,v3.16b,v19.16b //H<<<=1
- eor v3.16b,v3.16b,v16.16b //twisted H
- st1 {v3.2d},[x0]
-
- ret
-.size gcm_init_v8,.-gcm_init_v8
-
-.global gcm_gmult_v8
-.type gcm_gmult_v8,%function
-.align 4
-gcm_gmult_v8:
- ld1 {v17.2d},[x0] //load Xi
- movi v19.16b,#0xe1
- ld1 {v20.2d},[x1] //load twisted H
- shl v19.2d,v19.2d,#57
-#ifndef __ARMEB__
- rev64 v17.16b,v17.16b
-#endif
- ext v21.16b,v20.16b,v20.16b,#8
- mov x3,#0
- ext v3.16b,v17.16b,v17.16b,#8
- mov x12,#0
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- mov x2,x0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
-
-.global gcm_ghash_v8
-.type gcm_ghash_v8,%function
-.align 4
-gcm_ghash_v8:
- ld1 {v0.2d},[x0] //load [rotated] Xi
- subs x3,x3,#16
- movi v19.16b,#0xe1
- mov x12,#16
- ld1 {v20.2d},[x1] //load twisted H
- csel x12,xzr,x12,eq
- ext v0.16b,v0.16b,v0.16b,#8
- shl v19.2d,v19.2d,#57
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
- ext v21.16b,v20.16b,v20.16b,#8
-#ifndef __ARMEB__
- rev64 v0.16b,v0.16b
- rev64 v17.16b,v17.16b
-#endif
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- ext v3.16b,v17.16b,v17.16b,#8
- b .Loop_v8
-
-.align 4
-.Loop_v8:
- ext v18.16b,v0.16b,v0.16b,#8
- eor v3.16b,v3.16b,v0.16b //inp^=Xi
- eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi
-
-.Lgmult_v8:
- pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
- eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
- pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
- subs x3,x3,#16
- pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
- csel x12,xzr,x12,eq
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase
-
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
-#ifndef __ARMEB__
- rev64 v17.16b,v17.16b
-#endif
- eor v0.16b,v1.16b,v18.16b
- ext v3.16b,v17.16b,v17.16b,#8
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
- b.hs .Loop_v8
-
-#ifndef __ARMEB__
- rev64 v0.16b,v0.16b
-#endif
- ext v0.16b,v0.16b,v0.16b,#8
- st1 {v0.2d},[x0] //write out Xi
-
- ret
-.size gcm_ghash_v8,.-gcm_ghash_v8
-.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx.S b/app/openssl/crypto/modes/asm/ghashv8-armx.S
deleted file mode 100644
index f388c54e..00000000
--- a/app/openssl/crypto/modes/asm/ghashv8-armx.S
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "arm_arch.h"
-
-.text
-.fpu neon
-.code 32
-.global gcm_init_v8
-.type gcm_init_v8,%function
-.align 4
-gcm_init_v8:
- vld1.64 {q9},[r1] @ load H
- vmov.i8 q8,#0xe1
- vext.8 q3,q9,q9,#8
- vshl.i64 q8,q8,#57
- vshr.u64 q10,q8,#63
- vext.8 q8,q10,q8,#8 @ t0=0xc2....01
- vdup.32 q9,d18[1]
- vshr.u64 q11,q3,#63
- vshr.s32 q9,q9,#31 @ broadcast carry bit
- vand q11,q11,q8
- vshl.i64 q3,q3,#1
- vext.8 q11,q11,q11,#8
- vand q8,q8,q9
- vorr q3,q3,q11 @ H<<<=1
- veor q3,q3,q8 @ twisted H
- vst1.64 {q3},[r0]
-
- bx lr
-.size gcm_init_v8,.-gcm_init_v8
-
-.global gcm_gmult_v8
-.type gcm_gmult_v8,%function
-.align 4
-gcm_gmult_v8:
- vld1.64 {q9},[r0] @ load Xi
- vmov.i8 q11,#0xe1
- vld1.64 {q12},[r1] @ load twisted H
- vshl.u64 q11,q11,#57
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
- vext.8 q13,q12,q12,#8
- mov r3,#0
- vext.8 q3,q9,q9,#8
- mov r12,#0
- veor q13,q13,q12 @ Karatsuba pre-processing
- mov r2,r0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
-
-.global gcm_ghash_v8
-.type gcm_ghash_v8,%function
-.align 4
-gcm_ghash_v8:
- vld1.64 {q0},[r0] @ load [rotated] Xi
- subs r3,r3,#16
- vmov.i8 q11,#0xe1
- mov r12,#16
- vld1.64 {q12},[r1] @ load twisted H
- moveq r12,#0
- vext.8 q0,q0,q0,#8
- vshl.u64 q11,q11,#57
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
- vext.8 q13,q12,q12,#8
-#ifndef __ARMEB__
- vrev64.8 q0,q0
- vrev64.8 q9,q9
-#endif
- veor q13,q13,q12 @ Karatsuba pre-processing
- vext.8 q3,q9,q9,#8
- b .Loop_v8
-
-.align 4
-.Loop_v8:
- vext.8 q10,q0,q0,#8
- veor q3,q3,q0 @ inp^=Xi
- veor q9,q9,q10 @ q9 is rotated inp^Xi
-
-.Lgmult_v8:
- .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
- veor q9,q9,q3 @ Karatsuba pre-processing
- .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
- subs r3,r3,#16
- .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
- moveq r12,#0
-
- vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
- veor q10,q0,q2
- veor q1,q1,q9
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
- veor q1,q1,q10
- .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
-
- vmov d4,d3 @ Xh|Xm - 256-bit result
- vmov d3,d0 @ Xm is rotated Xl
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
- veor q0,q1,q10
- vext.8 q3,q9,q9,#8
-
- vext.8 q10,q0,q0,#8 @ 2nd phase
- .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
- veor q10,q10,q2
- veor q0,q0,q10
- bhs .Loop_v8
-
-#ifndef __ARMEB__
- vrev64.8 q0,q0
-#endif
- vext.8 q0,q0,q0,#8
- vst1.64 {q0},[r0] @ write out Xi
-
- bx lr
-.size gcm_ghash_v8,.-gcm_ghash_v8
-.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx.pl b/app/openssl/crypto/modes/asm/ghashv8-armx.pl
deleted file mode 100644
index 69e863e7..00000000
--- a/app/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ /dev/null
@@ -1,240 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
-#
-# June 2014
-#
-# Initial version was developed in tight cooperation with Ard
-# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
-# other assembly modules. Just like aesv8-armx.pl this module
-# supports both AArch32 and AArch64 execution modes.
-#
-# Current performance in cycles per processed byte:
-#
-# PMULL[2] 32-bit NEON(*)
-# Apple A7 1.76 5.62
-# Cortex-A5x n/a n/a
-#
-# (*) presented for reference/comparison purposes;
-
-$flavour = shift;
-open STDOUT,">".shift;
-
-$Xi="x0"; # argument block
-$Htbl="x1";
-$inp="x2";
-$len="x3";
-
-$inc="x12";
-
-{
-my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
-my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
-
-$code=<<___;
-#include "arm_arch.h"
-
-.text
-___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
-
-$code.=<<___;
-.global gcm_init_v8
-.type gcm_init_v8,%function
-.align 4
-gcm_init_v8:
- vld1.64 {$t1},[x1] @ load H
- vmov.i8 $t0,#0xe1
- vext.8 $IN,$t1,$t1,#8
- vshl.i64 $t0,$t0,#57
- vshr.u64 $t2,$t0,#63
- vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
- vdup.32 $t1,${t1}[1]
- vshr.u64 $t3,$IN,#63
- vshr.s32 $t1,$t1,#31 @ broadcast carry bit
- vand $t3,$t3,$t0
- vshl.i64 $IN,$IN,#1
- vext.8 $t3,$t3,$t3,#8
- vand $t0,$t0,$t1
- vorr $IN,$IN,$t3 @ H<<<=1
- veor $IN,$IN,$t0 @ twisted H
- vst1.64 {$IN},[x0]
-
- ret
-.size gcm_init_v8,.-gcm_init_v8
-
-.global gcm_gmult_v8
-.type gcm_gmult_v8,%function
-.align 4
-gcm_gmult_v8:
- vld1.64 {$t1},[$Xi] @ load Xi
- vmov.i8 $t3,#0xe1
- vld1.64 {$H},[$Htbl] @ load twisted H
- vshl.u64 $t3,$t3,#57
-#ifndef __ARMEB__
- vrev64.8 $t1,$t1
-#endif
- vext.8 $Hhl,$H,$H,#8
- mov $len,#0
- vext.8 $IN,$t1,$t1,#8
- mov $inc,#0
- veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
- mov $inp,$Xi
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
-
-.global gcm_ghash_v8
-.type gcm_ghash_v8,%function
-.align 4
-gcm_ghash_v8:
- vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
- subs $len,$len,#16
- vmov.i8 $t3,#0xe1
- mov $inc,#16
- vld1.64 {$H},[$Htbl] @ load twisted H
- cclr $inc,eq
- vext.8 $Xl,$Xl,$Xl,#8
- vshl.u64 $t3,$t3,#57
- vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
- vext.8 $Hhl,$H,$H,#8
-#ifndef __ARMEB__
- vrev64.8 $Xl,$Xl
- vrev64.8 $t1,$t1
-#endif
- veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
- vext.8 $IN,$t1,$t1,#8
- b .Loop_v8
-
-.align 4
-.Loop_v8:
- vext.8 $t2,$Xl,$Xl,#8
- veor $IN,$IN,$Xl @ inp^=Xi
- veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
-
-.Lgmult_v8:
- vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
- veor $t1,$t1,$IN @ Karatsuba pre-processing
- vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
- subs $len,$len,#16
- vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
- cclr $inc,eq
-
- vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
- veor $t2,$Xl,$Xh
- veor $Xm,$Xm,$t1
- vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
- veor $Xm,$Xm,$t2
- vpmull.p64 $t2,$Xl,$t3 @ 1st phase
-
- vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
- vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
-#ifndef __ARMEB__
- vrev64.8 $t1,$t1
-#endif
- veor $Xl,$Xm,$t2
- vext.8 $IN,$t1,$t1,#8
-
- vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
- vpmull.p64 $Xl,$Xl,$t3
- veor $t2,$t2,$Xh
- veor $Xl,$Xl,$t2
- b.hs .Loop_v8
-
-#ifndef __ARMEB__
- vrev64.8 $Xl,$Xl
-#endif
- vext.8 $Xl,$Xl,$Xl,#8
- vst1.64 {$Xl},[$Xi] @ write out Xi
-
- ret
-.size gcm_ghash_v8,.-gcm_ghash_v8
-___
-}
-$code.=<<___;
-.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-if ($flavour =~ /64/) { ######## 64-bit code
- sub unvmov {
- my $arg=shift;
-
- $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
- sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
- }
- foreach(split("\n",$code)) {
- s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
- s/vmov\.i8/movi/o or # fix up legacy mnemonics
- s/vmov\s+(.*)/unvmov($1)/geo or
- s/vext\.8/ext/o or
- s/vshr\.s/sshr\.s/o or
- s/vshr/ushr/o or
- s/^(\s+)v/$1/o or # strip off v prefix
- s/\bbx\s+lr\b/ret/o;
-
- s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
- s/@\s/\/\//o; # old->new style commentary
-
- # fix up remainig legacy suffixes
- s/\.[ui]?8(\s)/$1/o;
- s/\.[uis]?32//o and s/\.16b/\.4s/go;
- m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
- m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
- s/\.[uisp]?64//o and s/\.16b/\.2d/go;
- s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
-
- print $_,"\n";
- }
-} else { ######## 32-bit code
- sub unvdup32 {
- my $arg=shift;
-
- $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
- }
- sub unvpmullp64 {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
- my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2);
- $word |= 0x00010001 if ($mnemonic =~ "2");
- # since ARMv7 instructions are always encoded little-endian.
- # correct solution is to use .inst directive, but older
- # assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- }
-
- foreach(split("\n",$code)) {
- s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
- s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
- s/\/\/\s?/@ /o; # new->old style commentary
-
- # fix up remainig new-style suffixes
- s/\],#[0-9]+/]!/o;
-
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
- s/vdup\.32\s+(.*)/unvdup32($1)/geo or
- s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/^(\s+)b\./$1b/o or
- s/^(\s+)ret/$1bx\tlr/o;
-
- print $_,"\n";
- }
-}
-
-close STDOUT; # enforce flush
diff --git a/app/openssl/crypto/modes/cbc128.c b/app/openssl/crypto/modes/cbc128.c
deleted file mode 100644
index 0e54f754..00000000
--- a/app/openssl/crypto/modes/cbc128.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-#ifndef STRICT_ALIGNMENT
-# define STRICT_ALIGNMENT 0
-#endif
-
-void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{
- size_t n;
- const unsigned char *iv = ivec;
-
- assert(in && out && key && ivec);
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- while (len>=16) {
- for(n=0; n<16; ++n)
- out[n] = in[n] ^ iv[n];
- (*block)(out, out, key);
- iv = out;
- len -= 16;
- in += 16;
- out += 16;
- }
- } else {
- while (len>=16) {
- for(n=0; n<16; n+=sizeof(size_t))
- *(size_t*)(out+n) =
- *(size_t*)(in+n) ^ *(size_t*)(iv+n);
- (*block)(out, out, key);
- iv = out;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
-#endif
- while (len) {
- for(n=0; n<16 && n<len; ++n)
- out[n] = in[n] ^ iv[n];
- for(; n<16; ++n)
- out[n] = iv[n];
- (*block)(out, out, key);
- iv = out;
- if (len<=16) break;
- len -= 16;
- in += 16;
- out += 16;
- }
- memcpy(ivec,iv,16);
-}
-
-void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{
- size_t n;
- union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
-
- assert(in && out && key && ivec);
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (in != out) {
- const unsigned char *iv = ivec;
-
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- while (len>=16) {
- (*block)(in, out, key);
- for(n=0; n<16; ++n)
- out[n] ^= iv[n];
- iv = in;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- else if (16%sizeof(size_t) == 0) { /* always true */
- while (len>=16) {
- size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
-
- (*block)(in, out, key);
- for(n=0; n<16/sizeof(size_t); n++)
- out_t[n] ^= iv_t[n];
- iv = in;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- memcpy(ivec,iv,16);
- } else {
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- unsigned char c;
- while (len>=16) {
- (*block)(in, tmp.c, key);
- for(n=0; n<16; ++n) {
- c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = c;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- else if (16%sizeof(size_t) == 0) { /* always true */
- while (len>=16) {
- size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(in, tmp.c, key);
- for(n=0; n<16/sizeof(size_t); n++) {
- c = in_t[n];
- out_t[n] = tmp.t[n] ^ ivec_t[n];
- ivec_t[n] = c;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- }
-#endif
- while (len) {
- unsigned char c;
- (*block)(in, tmp.c, key);
- for(n=0; n<16 && n<len; ++n) {
- c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = c;
- }
- if (len<=16) {
- for (; n<16; ++n)
- ivec[n] = in[n];
- break;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
-}
diff --git a/app/openssl/crypto/modes/ccm128.c b/app/openssl/crypto/modes/ccm128.c
deleted file mode 100644
index 3ce11d0d..00000000
--- a/app/openssl/crypto/modes/ccm128.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-/* First you setup M and L parameters and pass the key schedule.
- * This is called once per session setup... */
-void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
- unsigned int M,unsigned int L,void *key,block128_f block)
-{
- memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
- ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
- ctx->blocks = 0;
- ctx->block = block;
- ctx->key = key;
-}
-
-/* !!! Following interfaces are to be called *once* per packet !!! */
-
-/* Then you setup per-message nonce and pass the length of the message */
-int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
- const unsigned char *nonce,size_t nlen,size_t mlen)
-{
- unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
-
- if (nlen<(14-L)) return -1; /* nonce is too short */
-
- if (sizeof(mlen)==8 && L>=3) {
- ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
- ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
- ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
- ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
- }
- else
- ctx->nonce.u[1] = 0;
-
- ctx->nonce.c[12] = (u8)(mlen>>24);
- ctx->nonce.c[13] = (u8)(mlen>>16);
- ctx->nonce.c[14] = (u8)(mlen>>8);
- ctx->nonce.c[15] = (u8)mlen;
-
- ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
- memcpy(&ctx->nonce.c[1],nonce,14-L);
-
- return 0;
-}
-
-/* Then you pass additional authentication data, this is optional */
-void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
- const unsigned char *aad,size_t alen)
-{ unsigned int i;
- block128_f block = ctx->block;
-
- if (alen==0) return;
-
- ctx->nonce.c[0] |= 0x40; /* set Adata flag */
- (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
- ctx->blocks++;
-
- if (alen<(0x10000-0x100)) {
- ctx->cmac.c[0] ^= (u8)(alen>>8);
- ctx->cmac.c[1] ^= (u8)alen;
- i=2;
- }
- else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
- ctx->cmac.c[0] ^= 0xFF;
- ctx->cmac.c[1] ^= 0xFF;
- ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
- ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
- ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
- ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
- ctx->cmac.c[6] ^= (u8)(alen>>24);
- ctx->cmac.c[7] ^= (u8)(alen>>16);
- ctx->cmac.c[8] ^= (u8)(alen>>8);
- ctx->cmac.c[9] ^= (u8)alen;
- i=10;
- }
- else {
- ctx->cmac.c[0] ^= 0xFF;
- ctx->cmac.c[1] ^= 0xFE;
- ctx->cmac.c[2] ^= (u8)(alen>>24);
- ctx->cmac.c[3] ^= (u8)(alen>>16);
- ctx->cmac.c[4] ^= (u8)(alen>>8);
- ctx->cmac.c[5] ^= (u8)alen;
- i=6;
- }
-
- do {
- for(;i<16 && alen;++i,++aad,--alen)
- ctx->cmac.c[i] ^= *aad;
- (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
- ctx->blocks++;
- i=0;
- } while (alen);
-}
-
-/* Finally you encrypt or decrypt the message */
-
-/* counter part of nonce may not be larger than L*8 bits,
- * L is not larger than 8, therefore 64-bit counter... */
-static void ctr64_inc(unsigned char *counter) {
- unsigned int n=8;
- u8 c;
-
- counter += 8;
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
-}
-
-int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len)
-{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key),
- ctx->blocks++;
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1; /* length mismatch */
-
- ctx->blocks += ((len+15)>>3)|1;
- if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
-
- while (len>=16) {
-#if defined(STRICT_ALIGNMENT)
- union { u64 u[2]; u8 c[16]; } temp;
-
- memcpy (temp.c,inp,16);
- ctx->cmac.u[0] ^= temp.u[0];
- ctx->cmac.u[1] ^= temp.u[1];
-#else
- ctx->cmac.u[0] ^= ((u64*)inp)[0];
- ctx->cmac.u[1] ^= ((u64*)inp)[1];
-#endif
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- ctr64_inc(ctx->nonce.c);
-#if defined(STRICT_ALIGNMENT)
- temp.u[0] ^= scratch.u[0];
- temp.u[1] ^= scratch.u[1];
- memcpy(out,temp.c,16);
-#else
- ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
- ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
-#endif
- inp += 16;
- out += 16;
- len -= 16;
- }
-
- if (len) {
- for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
-}
-
-int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len)
-{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key);
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1;
-
- while (len>=16) {
-#if defined(STRICT_ALIGNMENT)
- union { u64 u[2]; u8 c[16]; } temp;
-#endif
- (*block)(ctx->nonce.c,scratch.c,key);
- ctr64_inc(ctx->nonce.c);
-#if defined(STRICT_ALIGNMENT)
- memcpy (temp.c,inp,16);
- ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
- ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
- memcpy (out,scratch.c,16);
-#else
- ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
- ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
-#endif
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
-
- inp += 16;
- out += 16;
- len -= 16;
- }
-
- if (len) {
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i)
- ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
-}
-
-static void ctr64_add (unsigned char *counter,size_t inc)
-{ size_t n=8, val=0;
-
- counter += 8;
- do {
- --n;
- val += counter[n] + (inc&0xff);
- counter[n] = (unsigned char)val;
- val >>= 8; /* carry bit */
- inc >>= 8;
- } while(n && (inc || val));
-}
-
-int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len,ccm128_f stream)
-{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key),
- ctx->blocks++;
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1; /* length mismatch */
-
- ctx->blocks += ((len+15)>>3)|1;
- if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
-
- if ((n=len/16)) {
- (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
- n *= 16;
- inp += n;
- out += n;
- len -= n;
- if (len) ctr64_add(ctx->nonce.c,n/16);
- }
-
- if (len) {
- for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
-}
-
-int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len,ccm128_f stream)
-{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key);
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1;
-
- if ((n=len/16)) {
- (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
- n *= 16;
- inp += n;
- out += n;
- len -= n;
- if (len) ctr64_add(ctx->nonce.c,n/16);
- }
-
- if (len) {
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i)
- ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
-}
-
-size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
-{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
-
- M *= 2; M += 2;
- if (len<M) return 0;
- memcpy(tag,ctx->cmac.c,M);
- return M;
-}
diff --git a/app/openssl/crypto/modes/cfb128.c b/app/openssl/crypto/modes/cfb128.c
deleted file mode 100644
index 4e6f5d35..00000000
--- a/app/openssl/crypto/modes/cfb128.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-/* The input and output encrypted as though 128bit cfb mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
- */
-void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
-{
- unsigned int n;
- size_t l = 0;
-
- assert(in && out && key && ivec && num);
-
- n = *num;
-
- if (enc) {
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = ivec[n] ^= *(in++);
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t)) {
- *(size_t*)(out+n) =
- *(size_t*)(ivec+n) ^= *(size_t*)(in+n);
- }
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- out[n] = ivec[n] ^= in[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while (0);
- /* the rest would be commonly eliminated by x86* compiler */
-#endif
- while (l<len) {
- if (n == 0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = ivec[n] ^= in[l];
- ++l;
- n = (n+1) % 16;
- }
- *num = n;
- } else {
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- unsigned char c;
- *(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t)) {
- size_t t = *(size_t*)(in+n);
- *(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
- *(size_t*)(ivec+n) = t;
- }
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- unsigned char c;
- out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
- ++n;
- }
- }
- *num = n;
- return;
- } while (0);
- /* the rest would be commonly eliminated by x86* compiler */
-#endif
- while (l<len) {
- unsigned char c;
- if (n == 0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
- ++l;
- n = (n+1) % 16;
- }
- *num=n;
- }
-}
-
-/* This expects a single block of size nbits for both in and out. Note that
- it corrupts any extra bits in the last byte of out */
-static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
- int nbits,const void *key,
- unsigned char ivec[16],int enc,
- block128_f block)
-{
- int n,rem,num;
- unsigned char ovec[16*2 + 1]; /* +1 because we dererefence (but don't use) one byte off the end */
-
- if (nbits<=0 || nbits>128) return;
-
- /* fill in the first half of the new IV with the current IV */
- memcpy(ovec,ivec,16);
- /* construct the new IV */
- (*block)(ivec,ivec,key);
- num = (nbits+7)/8;
- if (enc) /* encrypt the input */
- for(n=0 ; n < num ; ++n)
- out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
- else /* decrypt the input */
- for(n=0 ; n < num ; ++n)
- out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
- /* shift ovec left... */
- rem = nbits%8;
- num = nbits/8;
- if(rem==0)
- memcpy(ivec,ovec+num,16);
- else
- for(n=0 ; n < 16 ; ++n)
- ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
-
- /* it is not necessary to cleanse ovec, since the IV is not secret */
-}
-
-/* N.B. This expects the input to be packed, MS bit first */
-void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
- size_t bits, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
-{
- size_t n;
- unsigned char c[1],d[1];
-
- assert(in && out && key && ivec && num);
- assert(*num == 0);
-
- for(n=0 ; n<bits ; ++n)
- {
- c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
- cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
- out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
- ((d[0]&0x80) >> (unsigned int)(n%8));
- }
-}
-
-void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
- size_t length, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
-{
- size_t n;
-
- assert(in && out && key && ivec && num);
- assert(*num == 0);
-
- for(n=0 ; n<length ; ++n)
- cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
-}
-
diff --git a/app/openssl/crypto/modes/ctr128.c b/app/openssl/crypto/modes/ctr128.c
deleted file mode 100644
index ee642c58..00000000
--- a/app/openssl/crypto/modes/ctr128.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-/* NOTE: the IV/counter CTR mode is big-endian. The code itself
- * is endian-neutral. */
-
-/* increment counter (128-bit int) by 1 */
-static void ctr128_inc(unsigned char *counter) {
- u32 n=16;
- u8 c;
-
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
-}
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-static void ctr128_inc_aligned(unsigned char *counter) {
- size_t *data,c,n;
- const union { long one; char little; } is_endian = {1};
-
- if (is_endian.little) {
- ctr128_inc(counter);
- return;
- }
-
- data = (size_t *)counter;
- n = 16/sizeof(size_t);
- do {
- --n;
- c = data[n];
- ++c;
- data[n] = c;
- if (c) return;
- } while (n);
-}
-#endif
-
-/* The input encrypted as though 128bit counter mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num, and the
- * encrypted counter is kept in ecount_buf. Both *num and
- * ecount_buf must be initialised with zeros before the first
- * call to CRYPTO_ctr128_encrypt().
- *
- * This algorithm assumes that the counter is in the x lower bits
- * of the IV (ivec), and that the application has full control over
- * overflow and the rest of the IV. This implementation takes NO
- * responsability for checking that the counter doesn't overflow
- * into the rest of the IV when incremented.
- */
-void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, block128_f block)
-{
- unsigned int n;
- size_t l=0;
-
- assert(in && out && key && ecount_buf && num);
- assert(*num < 16);
-
- n = *num;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = *(in++) ^ ecount_buf[n];
- --len;
- n = (n+1) % 16;
- }
-
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc_aligned(ivec);
- for (; n<16; n+=sizeof(size_t))
- *(size_t *)(out+n) =
- *(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc_aligned(ivec);
- while (len--) {
- out[n] = in[n] ^ ecount_buf[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while(0);
- /* the rest would be commonly eliminated by x86* compiler */
-#endif
- while (l<len) {
- if (n==0) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc(ivec);
- }
- out[l] = in[l] ^ ecount_buf[n];
- ++l;
- n = (n+1) % 16;
- }
-
- *num=n;
-}
-
-/* increment upper 96 bits of 128-bit counter by 1 */
-static void ctr96_inc(unsigned char *counter) {
- u32 n=12;
- u8 c;
-
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
-}
-
-void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, ctr128_f func)
-{
- unsigned int n,ctr32;
-
- assert(in && out && key && ecount_buf && num);
- assert(*num < 16);
-
- n = *num;
-
- while (n && len) {
- *(out++) = *(in++) ^ ecount_buf[n];
- --len;
- n = (n+1) % 16;
- }
-
- ctr32 = GETU32(ivec+12);
- while (len>=16) {
- size_t blocks = len/16;
- /*
- * 1<<28 is just a not-so-small yet not-so-large number...
- * Below condition is practically never met, but it has to
- * be checked for code correctness.
- */
- if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
- blocks = (1U<<28);
- /*
- * As (*func) operates on 32-bit counter, caller
- * has to handle overflow. 'if' below detects the
- * overflow, which is then handled by limiting the
- * amount of blocks to the exact overflow point...
- */
- ctr32 += (u32)blocks;
- if (ctr32 < blocks) {
- blocks -= ctr32;
- ctr32 = 0;
- }
- (*func)(in,out,blocks,key,ivec);
- /* (*ctr) does not update ivec, caller does: */
- PUTU32(ivec+12,ctr32);
- /* ... overflow was detected, propogate carry. */
- if (ctr32 == 0) ctr96_inc(ivec);
- blocks *= 16;
- len -= blocks;
- out += blocks;
- in += blocks;
- }
- if (len) {
- memset(ecount_buf,0,16);
- (*func)(ecount_buf,ecount_buf,1,key,ivec);
- ++ctr32;
- PUTU32(ivec+12,ctr32);
- if (ctr32 == 0) ctr96_inc(ivec);
- while (len--) {
- out[n] = in[n] ^ ecount_buf[n];
- ++n;
- }
- }
-
- *num=n;
-}
diff --git a/app/openssl/crypto/modes/gcm128.c b/app/openssl/crypto/modes/gcm128.c
deleted file mode 100644
index 79ebb66e..00000000
--- a/app/openssl/crypto/modes/gcm128.c
+++ /dev/null
@@ -1,1924 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#define OPENSSL_FIPSAPI
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
-/* redefine, because alignment is ensured */
-#undef GETU32
-#define GETU32(p) BSWAP4(*(const u32 *)(p))
-#undef PUTU32
-#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
-#endif
-
-#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
-#define REDUCE1BIT(V) do { \
- if (sizeof(size_t)==8) { \
- u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
- V.lo = (V.hi<<63)|(V.lo>>1); \
- V.hi = (V.hi>>1 )^T; \
- } \
- else { \
- u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
- V.lo = (V.hi<<63)|(V.lo>>1); \
- V.hi = (V.hi>>1 )^((u64)T<<32); \
- } \
-} while(0)
-
-/*
- * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
- * never be set to 8. 8 is effectively reserved for testing purposes.
- * TABLE_BITS>1 are lookup-table-driven implementations referred to as
- * "Shoup's" in GCM specification. In other words OpenSSL does not cover
- * whole spectrum of possible table driven implementations. Why? In
- * non-"Shoup's" case memory access pattern is segmented in such manner,
- * that it's trivial to see that cache timing information can reveal
- * fair portion of intermediate hash value. Given that ciphertext is
- * always available to attacker, it's possible for him to attempt to
- * deduce secret parameter H and if successful, tamper with messages
- * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
- * not as trivial, but there is no reason to believe that it's resistant
- * to cache-timing attack. And the thing about "8-bit" implementation is
- * that it consumes 16 (sixteen) times more memory, 4KB per individual
- * key + 1KB shared. Well, on pros side it should be twice as fast as
- * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
- * was observed to run ~75% faster, closer to 100% for commercial
- * compilers... Yet "4-bit" procedure is preferred, because it's
- * believed to provide better security-performance balance and adequate
- * all-round performance. "All-round" refers to things like:
- *
- * - shorter setup time effectively improves overall timing for
- * handling short messages;
- * - larger table allocation can become unbearable because of VM
- * subsystem penalties (for example on Windows large enough free
- * results in VM working set trimming, meaning that consequent
- * malloc would immediately incur working set expansion);
- * - larger table has larger cache footprint, which can affect
- * performance of other code paths (not necessarily even from same
- * thread in Hyper-Threading world);
- *
- * Value of 1 is not appropriate for performance reasons.
- */
-#if TABLE_BITS==8
-
-static void gcm_init_8bit(u128 Htable[256], u64 H[2])
-{
- int i, j;
- u128 V;
-
- Htable[0].hi = 0;
- Htable[0].lo = 0;
- V.hi = H[0];
- V.lo = H[1];
-
- for (Htable[128]=V, i=64; i>0; i>>=1) {
- REDUCE1BIT(V);
- Htable[i] = V;
- }
-
- for (i=2; i<256; i<<=1) {
- u128 *Hi = Htable+i, H0 = *Hi;
- for (j=1; j<i; ++j) {
- Hi[j].hi = H0.hi^Htable[j].hi;
- Hi[j].lo = H0.lo^Htable[j].lo;
- }
- }
-}
-
-static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
-{
- u128 Z = { 0, 0};
- const u8 *xi = (const u8 *)Xi+15;
- size_t rem, n = *xi;
- const union { long one; char little; } is_endian = {1};
- static const size_t rem_8bit[256] = {
- PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
- PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
- PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
- PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
- PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
- PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
- PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
- PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
- PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
- PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
- PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
- PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
- PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
- PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
- PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
- PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
- PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
- PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
- PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
- PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
- PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
- PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
- PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
- PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
- PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
- PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
- PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
- PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
- PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
- PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
- PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
- PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
- PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
- PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
- PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
- PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
- PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
- PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
- PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
- PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
- PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
- PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
- PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
- PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
- PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
- PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
- PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
- PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
- PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
- PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
- PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
- PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
- PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
- PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
- PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
- PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
- PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
- PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
- PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
- PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
- PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
- PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
- PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
- PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
-
- while (1) {
- Z.hi ^= Htable[n].hi;
- Z.lo ^= Htable[n].lo;
-
- if ((u8 *)Xi==xi) break;
-
- n = *(--xi);
-
- rem = (size_t)Z.lo&0xff;
- Z.lo = (Z.hi<<56)|(Z.lo>>8);
- Z.hi = (Z.hi>>8);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_8bit[rem];
- else
- Z.hi ^= (u64)rem_8bit[rem]<<32;
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
-}
-#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
-
-#elif TABLE_BITS==4
-
-static void gcm_init_4bit(u128 Htable[16], u64 H[2])
-{
- u128 V;
-#if defined(OPENSSL_SMALL_FOOTPRINT)
- int i;
-#endif
-
- Htable[0].hi = 0;
- Htable[0].lo = 0;
- V.hi = H[0];
- V.lo = H[1];
-
-#if defined(OPENSSL_SMALL_FOOTPRINT)
- for (Htable[8]=V, i=4; i>0; i>>=1) {
- REDUCE1BIT(V);
- Htable[i] = V;
- }
-
- for (i=2; i<16; i<<=1) {
- u128 *Hi = Htable+i;
- int j;
- for (V=*Hi, j=1; j<i; ++j) {
- Hi[j].hi = V.hi^Htable[j].hi;
- Hi[j].lo = V.lo^Htable[j].lo;
- }
- }
-#else
- Htable[8] = V;
- REDUCE1BIT(V);
- Htable[4] = V;
- REDUCE1BIT(V);
- Htable[2] = V;
- REDUCE1BIT(V);
- Htable[1] = V;
- Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
- V=Htable[4];
- Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
- Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
- Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
- V=Htable[8];
- Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
- Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
- Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
- Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
- Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
- Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
- Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
-#endif
-#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
- /*
- * ARM assembler expects specific dword order in Htable.
- */
- {
- int j;
- const union { long one; char little; } is_endian = {1};
-
- if (is_endian.little)
- for (j=0;j<16;++j) {
- V = Htable[j];
- Htable[j].hi = V.lo;
- Htable[j].lo = V.hi;
- }
- else
- for (j=0;j<16;++j) {
- V = Htable[j];
- Htable[j].hi = V.lo<<32|V.lo>>32;
- Htable[j].lo = V.hi<<32|V.hi>>32;
- }
- }
-#endif
-}
-
-#ifndef GHASH_ASM
-static const size_t rem_4bit[16] = {
- PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
- PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
- PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
- PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
-
-static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
-{
- u128 Z;
- int cnt = 15;
- size_t rem, nlo, nhi;
- const union { long one; char little; } is_endian = {1};
-
- nlo = ((const u8 *)Xi)[15];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi = Htable[nlo].hi;
- Z.lo = Htable[nlo].lo;
-
- while (1) {
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
-
- if (--cnt<0) break;
-
- nlo = ((const u8 *)Xi)[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
-}
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-/*
- * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
- * details... Compiler-generated code doesn't seem to give any
- * performance improvement, at least not on x86[_64]. It's here
- * mostly as reference and a placeholder for possible future
- * non-trivial optimization[s]...
- */
-static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len)
-{
- u128 Z;
- int cnt;
- size_t rem, nlo, nhi;
- const union { long one; char little; } is_endian = {1};
-
-#if 1
- do {
- cnt = 15;
- nlo = ((const u8 *)Xi)[15];
- nlo ^= inp[15];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi = Htable[nlo].hi;
- Z.lo = Htable[nlo].lo;
-
- while (1) {
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
-
- if (--cnt<0) break;
-
- nlo = ((const u8 *)Xi)[cnt];
- nlo ^= inp[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
- }
-#else
- /*
- * Extra 256+16 bytes per-key plus 512 bytes shared tables
- * [should] give ~50% improvement... One could have PACK()-ed
- * the rem_8bit even here, but the priority is to minimize
- * cache footprint...
- */
- u128 Hshr4[16]; /* Htable shifted right by 4 bits */
- u8 Hshl4[16]; /* Htable shifted left by 4 bits */
- static const unsigned short rem_8bit[256] = {
- 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
- 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
- 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
- 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
- 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
- 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
- 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
- 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
- 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
- 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
- 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
- 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
- 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
- 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
- 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
- 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
- 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
- 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
- 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
- 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
- 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
- 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
- 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
- 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
- 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
- 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
- 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
- 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
- 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
- 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
- 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
- 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
- /*
- * This pre-processing phase slows down procedure by approximately
- * same time as it makes each loop spin faster. In other words
- * single block performance is approximately same as straightforward
- * "4-bit" implementation, and then it goes only faster...
- */
- for (cnt=0; cnt<16; ++cnt) {
- Z.hi = Htable[cnt].hi;
- Z.lo = Htable[cnt].lo;
- Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
- Hshr4[cnt].hi = (Z.hi>>4);
- Hshl4[cnt] = (u8)(Z.lo<<4);
- }
-
- do {
- for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
- nlo = ((const u8 *)Xi)[cnt];
- nlo ^= inp[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
-
- rem = (size_t)Z.lo&0xff;
-
- Z.lo = (Z.hi<<56)|(Z.lo>>8);
- Z.hi = (Z.hi>>8);
-
- Z.hi ^= Hshr4[nhi].hi;
- Z.lo ^= Hshr4[nhi].lo;
- Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
- }
-
- nlo = ((const u8 *)Xi)[0];
- nlo ^= inp[0];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
-
- rem = (size_t)Z.lo&0xf;
-
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
-
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
- Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
-#endif
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
- } while (inp+=16, len-=16);
-}
-#endif
-#else
-void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-#endif
-
-#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
-#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
-/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
- * trashing effect. In other words idea is to hash data while it's
- * still in L1 cache after encryption pass... */
-#define GHASH_CHUNK (3*1024)
-#endif
-
-#else /* TABLE_BITS */
-
-static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
-{
- u128 V,Z = { 0,0 };
- long X;
- int i,j;
- const long *xi = (const long *)Xi;
- const union { long one; char little; } is_endian = {1};
-
- V.hi = H[0]; /* H is in host byte order, no byte swapping */
- V.lo = H[1];
-
- for (j=0; j<16/sizeof(long); ++j) {
- if (is_endian.little) {
- if (sizeof(long)==8) {
-#ifdef BSWAP8
- X = (long)(BSWAP8(xi[j]));
-#else
- const u8 *p = (const u8 *)(xi+j);
- X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
-#endif
- }
- else {
- const u8 *p = (const u8 *)(xi+j);
- X = (long)GETU32(p);
- }
- }
- else
- X = xi[j];
-
- for (i=0; i<8*sizeof(long); ++i, X<<=1) {
- u64 M = (u64)(X>>(8*sizeof(long)-1));
- Z.hi ^= V.hi&M;
- Z.lo ^= V.lo&M;
-
- REDUCE1BIT(V);
- }
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
-}
-#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
-
-#endif
-
-#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
-# if !defined(I386_ONLY) && \
- (defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
-# define GHASH_ASM_X86_OR_64
-# define GCM_FUNCREF_4BIT
-extern unsigned int OPENSSL_ia32cap_P[2];
-
-void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
-void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
-# define GHASH_ASM_X86
-void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-
-void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-# endif
-# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
-# include "arm_arch.h"
-# if __ARM_ARCH__>=7
-# define GHASH_ASM_ARM
-# define GCM_FUNCREF_4BIT
-# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
-# if defined(__arm__) || defined(__arm)
-# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
-# endif
-void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
-void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
-void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-# endif
-# endif
-#endif
-
-#ifdef GCM_FUNCREF_4BIT
-# undef GCM_MUL
-# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
-# ifdef GHASH
-# undef GHASH
-# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
-# endif
-#endif
-
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
-{
- const union { long one; char little; } is_endian = {1};
-
- memset(ctx,0,sizeof(*ctx));
- ctx->block = block;
- ctx->key = key;
-
- (*block)(ctx->H.c,ctx->H.c,key);
-
- if (is_endian.little) {
- /* H is stored in host byte order */
-#ifdef BSWAP8
- ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
- ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
-#else
- u8 *p = ctx->H.c;
- u64 hi,lo;
- hi = (u64)GETU32(p) <<32|GETU32(p+4);
- lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
- ctx->H.u[0] = hi;
- ctx->H.u[1] = lo;
-#endif
- }
-
-#if TABLE_BITS==8
- gcm_init_8bit(ctx->Htable,ctx->H.u);
-#elif TABLE_BITS==4
-# if defined(GHASH_ASM_X86_OR_64)
-# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
- if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
- OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
- gcm_init_clmul(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_clmul;
- ctx->ghash = gcm_ghash_clmul;
- return;
- }
-# endif
- gcm_init_4bit(ctx->Htable,ctx->H.u);
-# if defined(GHASH_ASM_X86) /* x86 only */
-# if defined(OPENSSL_IA32_SSE2)
- if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
-# else
- if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
-# endif
- ctx->gmult = gcm_gmult_4bit_mmx;
- ctx->ghash = gcm_ghash_4bit_mmx;
- } else {
- ctx->gmult = gcm_gmult_4bit_x86;
- ctx->ghash = gcm_ghash_4bit_x86;
- }
-# else
- ctx->gmult = gcm_gmult_4bit;
- ctx->ghash = gcm_ghash_4bit;
-# endif
-# elif defined(GHASH_ASM_ARM)
-# ifdef PMULL_CAPABLE
- if (PMULL_CAPABLE) {
- gcm_init_v8(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_v8;
- ctx->ghash = gcm_ghash_v8;
- } else
-# endif
-# ifdef NEON_CAPABLE
- if (NEON_CAPABLE) {
- gcm_init_neon(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_neon;
- ctx->ghash = gcm_ghash_neon;
- } else
-# endif
- {
- gcm_init_4bit(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_4bit;
- ctx->ghash = gcm_ghash_4bit;
- }
-# else
- gcm_init_4bit(ctx->Htable,ctx->H.u);
-# endif
-#endif
-}
-
-void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
-{
- const union { long one; char little; } is_endian = {1};
- unsigned int ctr;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-#endif
-
- ctx->Yi.u[0] = 0;
- ctx->Yi.u[1] = 0;
- ctx->Xi.u[0] = 0;
- ctx->Xi.u[1] = 0;
- ctx->len.u[0] = 0; /* AAD length */
- ctx->len.u[1] = 0; /* message length */
- ctx->ares = 0;
- ctx->mres = 0;
-
- if (len==12) {
- memcpy(ctx->Yi.c,iv,12);
- ctx->Yi.c[15]=1;
- ctr=1;
- }
- else {
- size_t i;
- u64 len0 = len;
-
- while (len>=16) {
- for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
- GCM_MUL(ctx,Yi);
- iv += 16;
- len -= 16;
- }
- if (len) {
- for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
- GCM_MUL(ctx,Yi);
- }
- len0 <<= 3;
- if (is_endian.little) {
-#ifdef BSWAP8
- ctx->Yi.u[1] ^= BSWAP8(len0);
-#else
- ctx->Yi.c[8] ^= (u8)(len0>>56);
- ctx->Yi.c[9] ^= (u8)(len0>>48);
- ctx->Yi.c[10] ^= (u8)(len0>>40);
- ctx->Yi.c[11] ^= (u8)(len0>>32);
- ctx->Yi.c[12] ^= (u8)(len0>>24);
- ctx->Yi.c[13] ^= (u8)(len0>>16);
- ctx->Yi.c[14] ^= (u8)(len0>>8);
- ctx->Yi.c[15] ^= (u8)(len0);
-#endif
- }
- else
- ctx->Yi.u[1] ^= len0;
-
- GCM_MUL(ctx,Yi);
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
- }
-
- (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
-}
-
-int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
-{
- size_t i;
- unsigned int n;
- u64 alen = ctx->len.u[0];
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
-# endif
-#endif
-
- if (ctx->len.u[1]) return -2;
-
- alen += len;
- if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
- return -1;
- ctx->len.u[0] = alen;
-
- n = ctx->ares;
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(aad++);
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->ares = n;
- return 0;
- }
- }
-
-#ifdef GHASH
- if ((i = (len&(size_t)-16))) {
- GHASH(ctx,aad,i);
- aad += i;
- len -= i;
- }
-#else
- while (len>=16) {
- for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
- GCM_MUL(ctx,Xi);
- aad += 16;
- len -= 16;
- }
-#endif
- if (len) {
- n = (unsigned int)len;
- for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
- }
-
- ctx->ares = n;
- return 0;
-}
-
-int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len)
-{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- block128_f block = ctx->block;
- void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
-# endif
-#endif
-
-#if 0
- n = (unsigned int)mlen%16; /* alternative to ctx->mres */
-#endif
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
-
- if (ctx->ares) {
- /* First call to encrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
- break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
- while (len>=GHASH_CHUNK) {
- size_t j=GHASH_CHUNK;
-
- while (j) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i] ^ ctx->EKi.t[i];
- out += 16;
- in += 16;
- j -= 16;
- }
- GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
- len -= GHASH_CHUNK;
- }
- if ((i = (len&(size_t)-16))) {
- size_t j=i;
-
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i] ^ ctx->EKi.t[i];
- out += 16;
- in += 16;
- len -= 16;
- }
- GHASH(ctx,out-j,j);
- }
-#else
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- ctx->Xi.t[i] ^=
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- GCM_MUL(ctx,Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
- if (len) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
- } while(0);
-#endif
- for (i=0;i<len;++i) {
- if (n==0) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- }
- ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
- n = (n+1)%16;
- if (n==0)
- GCM_MUL(ctx,Xi);
- }
-
- ctx->mres = n;
- return 0;
-}
-
-int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len)
-{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- block128_f block = ctx->block;
- void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
-# endif
-#endif
-
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
-
- if (ctx->ares) {
- /* First call to decrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- if (n) {
- while (n && len) {
- u8 c = *(in++);
- *(out++) = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL (ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
- break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
- while (len>=GHASH_CHUNK) {
- size_t j=GHASH_CHUNK;
-
- GHASH(ctx,in,GHASH_CHUNK);
- while (j) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- out += 16;
- in += 16;
- j -= 16;
- }
- len -= GHASH_CHUNK;
- }
- if ((i = (len&(size_t)-16))) {
- GHASH(ctx,in,i);
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- out += 16;
- in += 16;
- len -= 16;
- }
- }
-#else
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i) {
- size_t c = in[i];
- out[i] = c^ctx->EKi.t[i];
- ctx->Xi.t[i] ^= c;
- }
- GCM_MUL(ctx,Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
- if (len) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- u8 c = in[n];
- ctx->Xi.c[n] ^= c;
- out[n] = c^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
- } while(0);
-#endif
- for (i=0;i<len;++i) {
- u8 c;
- if (n==0) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- }
- c = in[i];
- out[i] = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- n = (n+1)%16;
- if (n==0)
- GCM_MUL(ctx,Xi);
- }
-
- ctx->mres = n;
- return 0;
-}
-
-int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len, ctr128_f stream)
-{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
-# endif
-#endif
-
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
-
- if (ctx->ares) {
- /* First call to encrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
- while (len>=GHASH_CHUNK) {
- (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
- ctr += GHASH_CHUNK/16;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- GHASH(ctx,out,GHASH_CHUNK);
- out += GHASH_CHUNK;
- in += GHASH_CHUNK;
- len -= GHASH_CHUNK;
- }
-#endif
- if ((i = (len&(size_t)-16))) {
- size_t j=i/16;
-
- (*stream)(in,out,j,key,ctx->Yi.c);
- ctr += (unsigned int)j;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- in += i;
- len -= i;
-#if defined(GHASH)
- GHASH(ctx,out,i);
- out += i;
-#else
- while (j--) {
- for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
- GCM_MUL(ctx,Xi);
- out += 16;
- }
-#endif
- }
- if (len) {
- (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
-}
-
-int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len,ctr128_f stream)
-{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
-# endif
-#endif
-
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
-
- if (ctx->ares) {
- /* First call to decrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
- if (n) {
- while (n && len) {
- u8 c = *(in++);
- *(out++) = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL (ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
- while (len>=GHASH_CHUNK) {
- GHASH(ctx,in,GHASH_CHUNK);
- (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
- ctr += GHASH_CHUNK/16;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- out += GHASH_CHUNK;
- in += GHASH_CHUNK;
- len -= GHASH_CHUNK;
- }
-#endif
- if ((i = (len&(size_t)-16))) {
- size_t j=i/16;
-
-#if defined(GHASH)
- GHASH(ctx,in,i);
-#else
- while (j--) {
- size_t k;
- for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
- GCM_MUL(ctx,Xi);
- in += 16;
- }
- j = i/16;
- in -= i;
-#endif
- (*stream)(in,out,j,key,ctx->Yi.c);
- ctr += (unsigned int)j;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- out += i;
- in += i;
- len -= i;
- }
- if (len) {
- (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- u8 c = in[n];
- ctx->Xi.c[n] ^= c;
- out[n] = c^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
-}
-
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
- size_t len)
-{
- const union { long one; char little; } is_endian = {1};
- u64 alen = ctx->len.u[0]<<3;
- u64 clen = ctx->len.u[1]<<3;
-#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-#endif
-
- if (ctx->mres || ctx->ares)
- GCM_MUL(ctx,Xi);
-
- if (is_endian.little) {
-#ifdef BSWAP8
- alen = BSWAP8(alen);
- clen = BSWAP8(clen);
-#else
- u8 *p = ctx->len.c;
-
- ctx->len.u[0] = alen;
- ctx->len.u[1] = clen;
-
- alen = (u64)GETU32(p) <<32|GETU32(p+4);
- clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
-#endif
- }
-
- ctx->Xi.u[0] ^= alen;
- ctx->Xi.u[1] ^= clen;
- GCM_MUL(ctx,Xi);
-
- ctx->Xi.u[0] ^= ctx->EK0.u[0];
- ctx->Xi.u[1] ^= ctx->EK0.u[1];
-
- if (tag && len<=sizeof(ctx->Xi))
- return memcmp(ctx->Xi.c,tag,len);
- else
- return -1;
-}
-
-void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
-{
- CRYPTO_gcm128_finish(ctx, NULL, 0);
- memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
-}
-
-GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
-{
- GCM128_CONTEXT *ret;
-
- if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
- CRYPTO_gcm128_init(ret,key,block);
-
- return ret;
-}
-
-void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
-{
- if (ctx) {
- OPENSSL_cleanse(ctx,sizeof(*ctx));
- OPENSSL_free(ctx);
- }
-}
-
-#if defined(SELFTEST)
-#include <stdio.h>
-#include <openssl/aes.h>
-
-/* Test Case 1 */
-static const u8 K1[16],
- *P1=NULL,
- *A1=NULL,
- IV1[12],
- *C1=NULL,
- T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
-
-/* Test Case 2 */
-#define K2 K1
-#define A2 A1
-#define IV2 IV1
-static const u8 P2[16],
- C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
- T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
-
-/* Test Case 3 */
-#define A3 A2
-static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
- P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
- 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
- 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
- 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
- T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
-
-/* Test Case 4 */
-#define K4 K3
-#define IV4 IV3
-static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
- 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
- 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
- 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
- T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
-
-/* Test Case 5 */
-#define K5 K4
-#define P5 P4
-#define A5 A4
-static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
- 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
- 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
- 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
- T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
-
-/* Test Case 6 */
-#define K6 K5
-#define P6 P5
-#define A6 A5
-static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
- 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
- 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
- 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
- T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
-
-/* Test Case 7 */
-static const u8 K7[24],
- *P7=NULL,
- *A7=NULL,
- IV7[12],
- *C7=NULL,
- T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
-
-/* Test Case 8 */
-#define K8 K7
-#define IV8 IV7
-#define A8 A7
-static const u8 P8[16],
- C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
- T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
-
-/* Test Case 9 */
-#define A9 A8
-static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
- 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
- P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
- 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
- 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
- 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
- T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
-
-/* Test Case 10 */
-#define K10 K9
-#define IV10 IV9
-static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
- 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
- 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
- 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
- T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
-
-/* Test Case 11 */
-#define K11 K10
-#define P11 P10
-#define A11 A10
-static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
- 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
- 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
- 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
- T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
-
-/* Test Case 12 */
-#define K12 K11
-#define P12 P11
-#define A12 A11
-static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
- 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
- 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
- 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
- T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
-
-/* Test Case 13 */
-static const u8 K13[32],
- *P13=NULL,
- *A13=NULL,
- IV13[12],
- *C13=NULL,
- T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
-
-/* Test Case 14 */
-#define K14 K13
-#define A14 A13
-static const u8 P14[16],
- IV14[12],
- C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
- T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
-
-/* Test Case 15 */
-#define A15 A14
-static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
- 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
- P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
- T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
-
-/* Test Case 16 */
-#define K16 K15
-#define IV16 IV15
-static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
- T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
-
-/* Test Case 17 */
-#define K17 K16
-#define P17 P16
-#define A17 A16
-static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
- 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
- 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
- 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
- T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
-
-/* Test Case 18 */
-#define K18 K17
-#define P18 P17
-#define A18 A17
-static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
- 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
- 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
- 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
- T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
-
-/* Test Case 19 */
-#define K19 K1
-#define P19 P1
-#define IV19 IV1
-#define C19 C1
-static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
- 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
- T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
-
-/* Test Case 20 */
-#define K20 K1
-#define A20 A1
-static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
- P20[288],
- C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
- 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
- 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
- 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
- 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
- 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
- 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
- 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
- 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
- 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
- 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
- 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
- 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
- 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
- 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
- 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
- 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
- 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
- T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
-
-#define TEST_CASE(n) do { \
- u8 out[sizeof(P##n)]; \
- AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
- CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
- CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
- memset(out,0,sizeof(out)); \
- if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
- if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
- if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
- (C##n && memcmp(out,C##n,sizeof(out)))) \
- ret++, printf ("encrypt test#%d failed.\n",n); \
- CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
- memset(out,0,sizeof(out)); \
- if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
- if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
- if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
- (P##n && memcmp(out,P##n,sizeof(out)))) \
- ret++, printf ("decrypt test#%d failed.\n",n); \
- } while(0)
-
-int main()
-{
- GCM128_CONTEXT ctx;
- AES_KEY key;
- int ret=0;
-
- TEST_CASE(1);
- TEST_CASE(2);
- TEST_CASE(3);
- TEST_CASE(4);
- TEST_CASE(5);
- TEST_CASE(6);
- TEST_CASE(7);
- TEST_CASE(8);
- TEST_CASE(9);
- TEST_CASE(10);
- TEST_CASE(11);
- TEST_CASE(12);
- TEST_CASE(13);
- TEST_CASE(14);
- TEST_CASE(15);
- TEST_CASE(16);
- TEST_CASE(17);
- TEST_CASE(18);
- TEST_CASE(19);
- TEST_CASE(20);
-
-#ifdef OPENSSL_CPUID_OBJ
- {
- size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
- union { u64 u; u8 c[1024]; } buf;
- int i;
-
- AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
- CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
- CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
-
- CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
- start = OPENSSL_rdtsc();
- CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
- gcm_t = OPENSSL_rdtsc() - start;
-
- CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
- &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
- (block128_f)AES_encrypt);
- start = OPENSSL_rdtsc();
- CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
- &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
- (block128_f)AES_encrypt);
- ctr_t = OPENSSL_rdtsc() - start;
-
- printf("%.2f-%.2f=%.2f\n",
- gcm_t/(double)sizeof(buf),
- ctr_t/(double)sizeof(buf),
- (gcm_t-ctr_t)/(double)sizeof(buf));
-#ifdef GHASH
- {
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx.ghash;
-
- GHASH((&ctx),buf.c,sizeof(buf));
- start = OPENSSL_rdtsc();
- for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
- gcm_t = OPENSSL_rdtsc() - start;
- printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
- }
-#endif
- }
-#endif
-
- return ret;
-}
-#endif
diff --git a/app/openssl/crypto/modes/modes_lcl.h b/app/openssl/crypto/modes/modes_lcl.h
deleted file mode 100644
index 9d83e128..00000000
--- a/app/openssl/crypto/modes/modes_lcl.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use is governed by OpenSSL license.
- * ====================================================================
- */
-
-#include <openssl/modes.h>
-
-
-#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
-typedef __int64 i64;
-typedef unsigned __int64 u64;
-#define U64(C) C##UI64
-#elif defined(__arch64__)
-typedef long i64;
-typedef unsigned long u64;
-#define U64(C) C##UL
-#else
-typedef long long i64;
-typedef unsigned long long u64;
-#define U64(C) C##ULL
-#endif
-
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
-#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-#if defined(__GNUC__) && __GNUC__>=2
-# if defined(__x86_64) || defined(__x86_64__)
-# define BSWAP8(x) ({ u64 ret=(x); \
- asm ("bswapq %0" \
- : "+r"(ret)); ret; })
-# define BSWAP4(x) ({ u32 ret=(x); \
- asm ("bswapl %0" \
- : "+r"(ret)); ret; })
-# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
- asm ("bswapl %0; bswapl %1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret=(x); \
- asm ("bswapl %0" \
- : "+r"(ret)); ret; })
-# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
- asm ("rev %0,%0; rev %1,%1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret; \
- asm ("rev %0,%1" \
- : "=r"(ret) : "r"((u32)(x))); \
- ret; })
-# endif
-#elif defined(_MSC_VER)
-# if _MSC_VER>=1300
-# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
-# define BSWAP8(x) _byteswap_uint64((u64)(x))
-# define BSWAP4(x) _byteswap_ulong((u32)(x))
-# elif defined(_M_IX86)
- __inline u32 _bswap4(u32 val) {
- _asm mov eax,val
- _asm bswap eax
- }
-# define BSWAP4(x) _bswap4(x)
-# endif
-#endif
-#endif
-
-#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
-#define GETU32(p) BSWAP4(*(const u32 *)(p))
-#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
-#else
-#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
-#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
-#endif
-
-/* GCM definitions */
-
-typedef struct { u64 hi,lo; } u128;
-
-#ifdef TABLE_BITS
-#undef TABLE_BITS
-#endif
-/*
- * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
- * never be set to 8 [or 1]. For further information see gcm128.c.
- */
-#define TABLE_BITS 4
-
-struct gcm128_context {
- /* Following 6 names follow names in GCM specification */
- union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
- Yi,EKi,EK0,len,Xi,H;
- /* Relative position of Xi, H and pre-computed Htable is used
- * in some assembler modules, i.e. don't change the order! */
-#if TABLE_BITS==8
- u128 Htable[256];
-#else
- u128 Htable[16];
- void (*gmult)(u64 Xi[2],const u128 Htable[16]);
- void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-#endif
- unsigned int mres, ares;
- block128_f block;
- void *key;
-};
-
-struct xts128_context {
- void *key1, *key2;
- block128_f block1,block2;
-};
-
-struct ccm128_context {
- union { u64 u[2]; u8 c[16]; } nonce, cmac;
- u64 blocks;
- block128_f block;
- void *key;
-};
-
diff --git a/app/openssl/crypto/modes/ofb128.c b/app/openssl/crypto/modes/ofb128.c
deleted file mode 100644
index 01c01702..00000000
--- a/app/openssl/crypto/modes/ofb128.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-/* The input and output encrypted as though 128bit ofb mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
- */
-void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- block128_f block)
-{
- unsigned int n;
- size_t l=0;
-
- assert(in && out && key && ivec && num);
-
- n = *num;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = *(in++) ^ ivec[n];
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t))
- *(size_t*)(out+n) =
- *(size_t*)(in+n) ^ *(size_t*)(ivec+n);
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- out[n] = in[n] ^ ivec[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while(0);
- /* the rest would be commonly eliminated by x86* compiler */
-#endif
- while (l<len) {
- if (n==0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = in[l] ^ ivec[n];
- ++l;
- n = (n+1) % 16;
- }
-
- *num=n;
-}
diff --git a/app/openssl/crypto/modes/xts128.c b/app/openssl/crypto/modes/xts128.c
deleted file mode 100644
index 9cf27a25..00000000
--- a/app/openssl/crypto/modes/xts128.c
+++ /dev/null
@@ -1,187 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#include <openssl/crypto.h>
-#include "modes_lcl.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-# define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
- const unsigned char *inp, unsigned char *out,
- size_t len, int enc)
-{
- const union { long one; char little; } is_endian = {1};
- union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
- unsigned int i;
-
- if (len<16) return -1;
-
- memcpy(tweak.c, iv, 16);
-
- (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
-
- if (!enc && (len%16)) len-=16;
-
- while (len>=16) {
-#if defined(STRICT_ALIGNMENT)
- memcpy(scratch.c,inp,16);
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
-#else
- scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
- scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
-#endif
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
-#if defined(STRICT_ALIGNMENT)
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy(out,scratch.c,16);
-#else
- ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
- ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
-#endif
- inp += 16;
- out += 16;
- len -= 16;
-
- if (len==0) return 0;
-
- if (is_endian.little) {
- unsigned int carry,res;
-
- res = 0x87&(((int)tweak.d[3])>>31);
- carry = (unsigned int)(tweak.u[0]>>63);
- tweak.u[0] = (tweak.u[0]<<1)^res;
- tweak.u[1] = (tweak.u[1]<<1)|carry;
- }
- else {
- size_t c;
-
- for (c=0,i=0;i<16;++i) {
- /*+ substitutes for |, because c is 1 bit */
- c += ((size_t)tweak.c[i])<<1;
- tweak.c[i] = (u8)c;
- c = c>>8;
- }
- tweak.c[0] ^= (u8)(0x87&(0-c));
- }
- }
- if (enc) {
- for (i=0;i<len;++i) {
- u8 c = inp[i];
- out[i] = scratch.c[i];
- scratch.c[i] = c;
- }
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy(out-16,scratch.c,16);
- }
- else {
- union { u64 u[2]; u8 c[16]; } tweak1;
-
- if (is_endian.little) {
- unsigned int carry,res;
-
- res = 0x87&(((int)tweak.d[3])>>31);
- carry = (unsigned int)(tweak.u[0]>>63);
- tweak1.u[0] = (tweak.u[0]<<1)^res;
- tweak1.u[1] = (tweak.u[1]<<1)|carry;
- }
- else {
- size_t c;
-
- for (c=0,i=0;i<16;++i) {
- /*+ substitutes for |, because c is 1 bit */
- c += ((size_t)tweak.c[i])<<1;
- tweak1.c[i] = (u8)c;
- c = c>>8;
- }
- tweak1.c[0] ^= (u8)(0x87&(0-c));
- }
-#if defined(STRICT_ALIGNMENT)
- memcpy(scratch.c,inp,16);
- scratch.u[0] ^= tweak1.u[0];
- scratch.u[1] ^= tweak1.u[1];
-#else
- scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
- scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
-#endif
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
- scratch.u[0] ^= tweak1.u[0];
- scratch.u[1] ^= tweak1.u[1];
-
- for (i=0;i<len;++i) {
- u8 c = inp[16+i];
- out[16+i] = scratch.c[i];
- scratch.c[i] = c;
- }
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
-#if defined(STRICT_ALIGNMENT)
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy (out,scratch.c,16);
-#else
- ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
- ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
-#endif
- }
-
- return 0;
-}