Diffstat (limited to 'app/openssl/crypto/aes/asm')
-rw-r--r--  app/openssl/crypto/aes/asm/aes-armv4.pl     139
-rw-r--r--  app/openssl/crypto/aes/asm/aes-armv4.s      160
-rw-r--r--  app/openssl/crypto/aes/asm/aesv8-armx-64.S  761
-rw-r--r--  app/openssl/crypto/aes/asm/aesv8-armx.S     767
-rw-r--r--  app/openssl/crypto/aes/asm/aesv8-armx.pl    980
5 files changed, 55 insertions, 2752 deletions
diff --git a/app/openssl/crypto/aes/asm/aes-armv4.pl b/app/openssl/crypto/aes/asm/aes-armv4.pl
index 4f891708..86b86c4a 100644
--- a/app/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/app/openssl/crypto/aes/asm/aes-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -51,23 +51,9 @@ $key="r11";
$rounds="r12";
$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-#endif
-
+#include "arm_arch.h"
.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
.code 32
-# endif
-#endif
.type AES_Te,%object
.align 5
@@ -181,11 +167,7 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
-#else
- adr r3,AES_encrypt
-#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -427,21 +409,11 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
-#else
- adr r3,private_AES_set_encrypt_key
-#endif
teq r0,#0
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
moveq r0,#-1
beq .Labrt
@@ -450,9 +422,6 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
-#if __ARM_ARCH__>=7
- itt ne @ Thumb2 thing, sanity check in ARM
-#endif
movne r0,#-1
bne .Labrt
@@ -607,9 +576,6 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
subeq r2,$key,#216
beq .Ldone
@@ -679,9 +645,6 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
subeq r2,$key,#256
beq .Ldone
@@ -711,17 +674,11 @@ _armv4_AES_set_encrypt_key:
str $i3,[$key,#-4]
b .L256_loop
-.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt:
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
+.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -731,57 +688,34 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldr lr,[sp],#4 @ pop lr
+ ldrne lr,[sp],#4 @ pop lr
bne .Labrt
- mov r0,r2 @ AES_set_encrypt_key preserves r2,
- mov r1,r2 @ which is AES_KEY *key
- b _armv4_AES_set_enc2dec_key
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+ stmdb sp!,{r4-r12}
-@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
-.global AES_set_enc2dec_key
-.type AES_set_enc2dec_key,%function
-.align 5
-AES_set_enc2dec_key:
-_armv4_AES_set_enc2dec_key:
- stmdb sp!,{r4-r12,lr}
-
- ldr $rounds,[r0,#240]
- mov $i1,r0 @ input
- add $i2,r0,$rounds,lsl#4
-	mov	$key,r1			@ output
- add $tbl,r1,$rounds,lsl#4
- str $rounds,[r1,#240]
-
-.Linv: ldr $s0,[$i1],#16
- ldr $s1,[$i1,#-12]
- ldr $s2,[$i1,#-8]
- ldr $s3,[$i1,#-4]
- ldr $t1,[$i2],#-16
- ldr $t2,[$i2,#16+4]
- ldr $t3,[$i2,#16+8]
- ldr $i3,[$i2,#16+12]
- str $s0,[$tbl],#-16
- str $s1,[$tbl,#16+4]
- str $s2,[$tbl,#16+8]
- str $s3,[$tbl,#16+12]
- str $t1,[$key],#16
- str $t2,[$key,#-12]
- str $t3,[$key,#-8]
- str $i3,[$key,#-4]
- teq $i1,$i2
- bne .Linv
+ ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
+ mov $key,r2 @ which is AES_KEY *key
+ mov $i1,r2
+ add $i2,r2,$rounds,lsl#4
- ldr $s0,[$i1]
+.Linv: ldr $s0,[$i1]
ldr $s1,[$i1,#4]
ldr $s2,[$i1,#8]
ldr $s3,[$i1,#12]
- str $s0,[$key]
- str $s1,[$key,#4]
- str $s2,[$key,#8]
- str $s3,[$key,#12]
- sub $key,$key,$rounds,lsl#3
+ ldr $t1,[$i2]
+ ldr $t2,[$i2,#4]
+ ldr $t3,[$i2,#8]
+ ldr $i3,[$i2,#12]
+ str $s0,[$i2],#-16
+ str $s1,[$i2,#16+4]
+ str $s2,[$i2,#16+8]
+ str $s3,[$i2,#16+12]
+ str $t1,[$i1],#16
+ str $t2,[$i1,#-12]
+ str $t3,[$i1,#-8]
+ str $i3,[$i1,#-4]
+ teq $i1,$i2
+ bne .Linv
___
$mask80=$i1;
$mask1b=$i2;
@@ -839,7 +773,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -949,11 +883,7 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
-#else
- adr r3,AES_decrypt
-#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -1150,9 +1080,8 @@ _armv4_AES_decrypt:
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
- add $s1,$tbl,$s1,lsr#24
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
- ldrb $s1,[$s1] @ Td4[s1>>24]
+ ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
@@ -1165,8 +1094,7 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- add $s2,$tbl,$s2,lsr#24
- ldrb $s2,[$s2] @ Td4[s2>>24]
+ ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
@@ -1178,9 +1106,8 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
- add $s3,$tbl,$s3,lsr#24
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
- ldrb $s3,[$s3] @ Td4[s3>>24]
+ ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
@@ -1203,15 +1130,5 @@ _armv4_AES_decrypt:
___
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx\tlr/gm;
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
print $code;
close STDOUT; # enforce flush
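
The hunks above fold the separate AES_set_enc2dec_key routine back into private_AES_set_decrypt_key, restoring the inline .Linv loop that swaps round keys in place instead of copying them into a second AES_KEY. As rough orientation, here is a minimal C sketch of the swap the restored loop performs; the AES_KEY layout is a simplified stand-in (not OpenSSL's exact definition), and the InvMixColumns table pass that follows in the assembly is not shown.

    #include <stdint.h>

    typedef struct {
        uint32_t rd_key[4 * (14 + 1)]; /* enough for AES-256 */
        int rounds;
    } AES_KEY; /* simplified stand-in for illustration only */

    /* Swap 16-byte round keys from both ends inward, mirroring .Linv:
     * the decryption schedule uses the encryption round keys in
     * reverse order. */
    static void reverse_round_keys(AES_KEY *key)
    {
        uint32_t *lo = key->rd_key;                   /* first round key */
        uint32_t *hi = key->rd_key + 4 * key->rounds; /* last round key  */

        while (lo < hi) {
            for (int i = 0; i < 4; i++) {
                uint32_t t = lo[i];
                lo[i] = hi[i];
                hi[i] = t;
            }
            lo += 4; /* advance one round key */
            hi -= 4; /* retreat one round key */
        }
    }
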
diff --git a/app/openssl/crypto/aes/asm/aes-armv4.s b/app/openssl/crypto/aes/asm/aes-armv4.s
index 333a5227..2697d4ce 100644
--- a/app/openssl/crypto/aes/asm/aes-armv4.s
+++ b/app/openssl/crypto/aes/asm/aes-armv4.s
@@ -1,53 +1,6 @@
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ AES for ARMv4
-
-@ January 2007.
-@
-@ Code uses single 1K S-box and is >2 times faster than code generated
-@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
-@ allows to merge logical or arithmetic operation with shift or rotate
-@ in one instruction and emit combined result every cycle. The module
-@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-@ key [on single-issue Xscale PXA250 core].
-
-@ May 2007.
-@
-@ AES_set_[en|de]crypt_key is added.
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 12% improvement on
-@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~21.5 cycles per byte.
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-#endif
-
+#include "arm_arch.h"
.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
.code 32
-# endif
-#endif
.type AES_Te,%object
.align 5
@@ -161,11 +114,7 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
-#else
- adr r3,AES_encrypt
-#endif
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
mov r11,r2
@@ -407,21 +356,11 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
-#else
- adr r3,private_AES_set_encrypt_key
-#endif
teq r0,#0
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
moveq r0,#-1
beq .Labrt
@@ -430,9 +369,6 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
-#if __ARM_ARCH__>=7
- itt ne @ Thumb2 thing, sanity check in ARM
-#endif
movne r0,#-1
bne .Labrt
@@ -587,9 +523,6 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-16]
subs r12,r12,#1
str r3,[r11,#-12]
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
subeq r2,r11,#216
beq .Ldone
@@ -659,9 +592,6 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-24]
subs r12,r12,#1
str r3,[r11,#-20]
-#if __ARM_ARCH__>=7
- itt eq @ Thumb2 thing, sanity check in ARM
-#endif
subeq r2,r11,#256
beq .Ldone
@@ -691,17 +621,11 @@ _armv4_AES_set_encrypt_key:
str r9,[r11,#-4]
b .L256_loop
-.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt:
-#if __ARM_ARCH__>=5
- bx lr @ .word 0xe12fff1e
-#else
- tst lr,#1
+.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -711,57 +635,34 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldr lr,[sp],#4 @ pop lr
+ ldrne lr,[sp],#4 @ pop lr
bne .Labrt
- mov r0,r2 @ AES_set_encrypt_key preserves r2,
- mov r1,r2 @ which is AES_KEY *key
- b _armv4_AES_set_enc2dec_key
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
-
-@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
-.global AES_set_enc2dec_key
-.type AES_set_enc2dec_key,%function
-.align 5
-AES_set_enc2dec_key:
-_armv4_AES_set_enc2dec_key:
- stmdb sp!,{r4-r12,lr}
-
- ldr r12,[r0,#240]
- mov r7,r0 @ input
- add r8,r0,r12,lsl#4
-	mov	r11,r1			@ output
- add r10,r1,r12,lsl#4
- str r12,[r1,#240]
+ stmdb sp!,{r4-r12}
-.Linv: ldr r0,[r7],#16
- ldr r1,[r7,#-12]
- ldr r2,[r7,#-8]
- ldr r3,[r7,#-4]
- ldr r4,[r8],#-16
- ldr r5,[r8,#16+4]
- ldr r6,[r8,#16+8]
- ldr r9,[r8,#16+12]
- str r0,[r10],#-16
- str r1,[r10,#16+4]
- str r2,[r10,#16+8]
- str r3,[r10,#16+12]
- str r4,[r11],#16
- str r5,[r11,#-12]
- str r6,[r11,#-8]
- str r9,[r11,#-4]
- teq r7,r8
- bne .Linv
+ ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2,
+ mov r11,r2 @ which is AES_KEY *key
+ mov r7,r2
+ add r8,r2,r12,lsl#4
- ldr r0,[r7]
+.Linv: ldr r0,[r7]
ldr r1,[r7,#4]
ldr r2,[r7,#8]
ldr r3,[r7,#12]
- str r0,[r11]
- str r1,[r11,#4]
- str r2,[r11,#8]
- str r3,[r11,#12]
- sub r11,r11,r12,lsl#3
+ ldr r4,[r8]
+ ldr r5,[r8,#4]
+ ldr r6,[r8,#8]
+ ldr r9,[r8,#12]
+ str r0,[r8],#-16
+ str r1,[r8,#16+4]
+ str r2,[r8,#16+8]
+ str r3,[r8,#16+12]
+ str r4,[r7],#16
+ str r5,[r7,#-12]
+ str r6,[r7,#-8]
+ str r9,[r7,#-4]
+ teq r7,r8
+ bne .Linv
ldr r0,[r11,#16]! @ prefetch tp1
mov r7,#0x80
mov r8,#0x1b
@@ -814,7 +715,7 @@ _armv4_AES_set_enc2dec_key:
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -924,11 +825,7 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
-#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
-#else
- adr r3,AES_decrypt
-#endif
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
mov r11,r2
@@ -1125,9 +1022,8 @@ _armv4_AES_decrypt:
ldrb r6,[r10,r9] @ Td4[s0>>0]
and r9,lr,r1,lsr#8
- add r1,r10,r1,lsr#24
ldrb r7,[r10,r7] @ Td4[s1>>0]
- ldrb r1,[r1] @ Td4[s1>>24]
+ ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24]
ldrb r8,[r10,r8] @ Td4[s1>>16]
eor r0,r7,r0,lsl#24
ldrb r9,[r10,r9] @ Td4[s1>>8]
@@ -1140,8 +1036,7 @@ _armv4_AES_decrypt:
ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- add r2,r10,r2,lsr#24
- ldrb r2,[r2] @ Td4[s2>>24]
+ ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
eor r1,r8,r1,lsl#16
@@ -1153,9 +1048,8 @@ _armv4_AES_decrypt:
ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
- add r3,r10,r3,lsr#24
ldrb r9,[r10,r9] @ Td4[s3>>0]
- ldrb r3,[r3] @ Td4[s3>>24]
+ ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
ldr r7,[r11,#0]
eor r1,r1,r8,lsl#8
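
The aes-armv4.s hunks mirror the .pl changes above; the last-round cleanups replace an add-then-load pair with a single ldrb that folds the shift into the addressing mode (e.g. ldrb r1,[r10,r1,lsr#24]). Functionally, each byte of a last-round output word is an independent lookup in the 256-byte inverse S-box. A hedged C sketch, assuming Td4 is that table as in OpenSSL's aes_core.c:

    #include <stdint.h>

    /* One output word of the AES decryption last round: gather four
     * Td4 bytes according to InvShiftRows, then XOR the round key. */
    static uint32_t last_round_word(const uint8_t Td4[256],
                                    uint32_t t0, uint32_t t1,
                                    uint32_t t2, uint32_t t3,
                                    uint32_t rk)
    {
        return (((uint32_t)Td4[t0 >> 24]          << 24) |
                ((uint32_t)Td4[(t3 >> 16) & 0xff] << 16) |
                ((uint32_t)Td4[(t2 >>  8) & 0xff] <<  8) |
                 (uint32_t)Td4[t1 & 0xff]) ^ rk;
    }
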
diff --git a/app/openssl/crypto/aes/asm/aesv8-armx-64.S b/app/openssl/crypto/aes/asm/aesv8-armx-64.S
deleted file mode 100644
index be0a13df..00000000
--- a/app/openssl/crypto/aes/asm/aesv8-armx-64.S
+++ /dev/null
@@ -1,761 +0,0 @@
-#include "arm_arch.h"
-
-#if __ARM_ARCH__>=7
-.text
-.arch armv8-a+crypto
-.align 5
-rcon:
-.long 0x01,0x01,0x01,0x01
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
-.long 0x1b,0x1b,0x1b,0x1b
-
-.globl aes_v8_set_encrypt_key
-.type aes_v8_set_encrypt_key,%function
-.align 5
-aes_v8_set_encrypt_key:
-.Lenc_key:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- adr x3,rcon
- cmp w1,#192
-
- eor v0.16b,v0.16b,v0.16b
- ld1 {v3.16b},[x0],#16
- mov w1,#8 // reuse w1
- ld1 {v1.4s,v2.4s},[x3],#32
-
- b.lt .Loop128
- b.eq .L192
- b .L256
-
-.align 4
-.Loop128:
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
- b.ne .Loop128
-
- ld1 {v1.4s},[x3]
-
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
-
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- eor v3.16b,v3.16b,v6.16b
- st1 {v3.4s},[x2]
- add x2,x2,#0x50
-
- mov w12,#10
- b .Ldone
-
-.align 4
-.L192:
- ld1 {v4.8b},[x0],#8
- movi v6.16b,#8 // borrow v6.16b
- st1 {v3.4s},[x2],#16
- sub v2.16b,v2.16b,v6.16b // adjust the mask
-
-.Loop192:
- tbl v6.16b,{v4.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v4.8b},[x2],#8
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
-
- dup v5.4s,v3.s[3]
- eor v5.16b,v5.16b,v4.16b
- eor v6.16b,v6.16b,v1.16b
- ext v4.16b,v0.16b,v4.16b,#12
- shl v1.16b,v1.16b,#1
- eor v4.16b,v4.16b,v5.16b
- eor v3.16b,v3.16b,v6.16b
- eor v4.16b,v4.16b,v6.16b
- st1 {v3.4s},[x2],#16
- b.ne .Loop192
-
- mov w12,#12
- add x2,x2,#0x20
- b .Ldone
-
-.align 4
-.L256:
- ld1 {v4.16b},[x0]
- mov w1,#7
- mov w12,#14
- st1 {v3.4s},[x2],#16
-
-.Loop256:
- tbl v6.16b,{v4.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v4.4s},[x2],#16
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
- st1 {v3.4s},[x2],#16
- b.eq .Ldone
-
- dup v6.4s,v3.s[3] // just splat
- ext v5.16b,v0.16b,v4.16b,#12
- aese v6.16b,v0.16b
-
- eor v4.16b,v4.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v4.16b,v4.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v4.16b,v4.16b,v5.16b
-
- eor v4.16b,v4.16b,v6.16b
- b .Loop256
-
-.Ldone:
- str w12,[x2]
-
- eor x0,x0,x0 // return value
- ldr x29,[sp],#16
- ret
-.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
-
-.globl aes_v8_set_decrypt_key
-.type aes_v8_set_decrypt_key,%function
-.align 5
-aes_v8_set_decrypt_key:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- bl .Lenc_key
-
- sub x2,x2,#240 // restore original x2
- mov x4,#-16
- add x0,x2,x12,lsl#4 // end of key schedule
-
- ld1 {v0.4s},[x2]
- ld1 {v1.4s},[x0]
- st1 {v0.4s},[x0],x4
- st1 {v1.4s},[x2],#16
-
-.Loop_imc:
- ld1 {v0.4s},[x2]
- ld1 {v1.4s},[x0]
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- st1 {v0.4s},[x0],x4
- st1 {v1.4s},[x2],#16
- cmp x0,x2
- b.hi .Loop_imc
-
- ld1 {v0.4s},[x2]
- aesimc v0.16b,v0.16b
- st1 {v0.4s},[x0]
-
- eor x0,x0,x0 // return value
- ldp x29,x30,[sp],#16
- ret
-.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
-.globl aes_v8_encrypt
-.type aes_v8_encrypt,%function
-.align 5
-aes_v8_encrypt:
- ldr w3,[x2,#240]
- ld1 {v0.4s},[x2],#16
- ld1 {v2.16b},[x0]
- sub w3,w3,#2
- ld1 {v1.4s},[x2],#16
-
-.Loop_enc:
- aese v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
- aesmc v2.16b,v2.16b
- subs w3,w3,#2
- aese v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
- aesmc v2.16b,v2.16b
- b.gt .Loop_enc
-
- aese v2.16b,v0.16b
- ld1 {v0.4s},[x2]
- aesmc v2.16b,v2.16b
- aese v2.16b,v1.16b
- eor v2.16b,v2.16b,v0.16b
-
- st1 {v2.16b},[x1]
- ret
-.size aes_v8_encrypt,.-aes_v8_encrypt
-.globl aes_v8_decrypt
-.type aes_v8_decrypt,%function
-.align 5
-aes_v8_decrypt:
- ldr w3,[x2,#240]
- ld1 {v0.4s},[x2],#16
- ld1 {v2.16b},[x0]
- sub w3,w3,#2
- ld1 {v1.4s},[x2],#16
-
-.Loop_dec:
- aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
- aesimc v2.16b,v2.16b
- subs w3,w3,#2
- aesd v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
- aesimc v2.16b,v2.16b
- b.gt .Loop_dec
-
- aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2]
- aesimc v2.16b,v2.16b
- aesd v2.16b,v1.16b
- eor v2.16b,v2.16b,v0.16b
-
- st1 {v2.16b},[x1]
- ret
-.size aes_v8_decrypt,.-aes_v8_decrypt
-.globl aes_v8_cbc_encrypt
-.type aes_v8_cbc_encrypt,%function
-.align 5
-aes_v8_cbc_encrypt:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- subs x2,x2,#16
- mov x8,#16
- b.lo .Lcbc_abort
- csel x8,xzr,x8,eq
-
- cmp w5,#0 // en- or decrypting?
- ldr w5,[x3,#240]
- and x2,x2,#-16
- ld1 {v6.16b},[x4]
- ld1 {v0.16b},[x0],x8
-
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
- sub w5,w5,#6
- add x7,x3,x5,lsl#4 // pointer to last 7 round keys
- sub w5,w5,#2
- ld1 {v18.4s-v19.4s},[x7],#32
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
-
- add x7,x3,#32
- mov w6,w5
- b.eq .Lcbc_dec
-
- cmp w5,#2
- eor v0.16b,v0.16b,v6.16b
- eor v5.16b,v16.16b,v7.16b
- b.eq .Lcbc_enc128
-
-.Loop_cbc_enc:
- aese v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesmc v0.16b,v0.16b
- subs w6,w6,#2
- aese v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesmc v0.16b,v0.16b
- b.gt .Loop_cbc_enc
-
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v18.16b
- aesmc v0.16b,v0.16b
- add x7,x3,#16
- aese v0.16b,v19.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v23.16b
-
- mov w6,w5
- eor v6.16b,v0.16b,v7.16b
- st1 {v6.16b},[x1],#16
- b.hs .Loop_cbc_enc
-
- b .Lcbc_done
-
-.align 5
-.Lcbc_enc128:
- ld1 {v2.4s-v3.4s},[x7]
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- b .Lenter_cbc_enc128
-.Loop_cbc_enc128:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-.Lenter_cbc_enc128:
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v18.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
- aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs .Loop_cbc_enc128
-
- st1 {v6.16b},[x1],#16
- b .Lcbc_done
-
-.align 5
-.Lcbc_dec128:
- ld1 {v4.4s-v5.4s},[x7]
- eor v6.16b,v6.16b,v7.16b
- eor v2.16b,v0.16b,v7.16b
- mov x12,x8
-
-.Loop2x_cbc_dec128:
- aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- subs x2,x2,#32
- aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- csel x8,xzr,x8,lo
- aesd v0.16b,v4.16b
- aesd v1.16b,v4.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- csel x12,xzr,x12,ls
- aesd v0.16b,v5.16b
- aesd v1.16b,v5.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v18.16b
- aesd v1.16b,v18.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v19.16b
- aesd v1.16b,v19.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v20.16b
- aesd v1.16b,v20.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v21.16b
- aesd v1.16b,v21.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v22.16b
- aesd v1.16b,v22.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- aesd v0.16b,v23.16b
- aesd v1.16b,v23.16b
-
- eor v6.16b,v6.16b,v0.16b
- ld1 {v0.16b},[x0],x8
- eor v2.16b,v2.16b,v1.16b
- ld1 {v1.16b},[x0],x12
- st1 {v6.16b},[x1],#16
- eor v6.16b,v3.16b,v7.16b
- st1 {v2.16b},[x1],#16
- eor v2.16b,v0.16b,v7.16b
- orr v3.16b,v1.16b,v1.16b
- b.hs .Loop2x_cbc_dec128
-
- adds x2,x2,#32
- eor v6.16b,v6.16b,v7.16b
- b.eq .Lcbc_done
- eor v2.16b,v2.16b,v7.16b
- b .Lcbc_dec_tail
-
-.align 5
-.Lcbc_dec:
- subs x2,x2,#16
- orr v2.16b,v0.16b,v0.16b
- b.lo .Lcbc_dec_tail
-
- csel x8,xzr,x8,eq
- cmp w5,#2
- ld1 {v1.16b},[x0],x8
- orr v3.16b,v1.16b,v1.16b
- b.eq .Lcbc_dec128
-
-.Loop2x_cbc_dec:
- aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- subs w6,w6,#2
- aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- b.gt .Loop2x_cbc_dec
-
- aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- eor v4.16b,v6.16b,v7.16b
- eor v5.16b,v2.16b,v7.16b
- aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- orr v6.16b,v3.16b,v3.16b
- subs x2,x2,#32
- aesd v0.16b,v18.16b
- aesd v1.16b,v18.16b
- aesimc v0.16b,v0.16b
- csel x8,xzr,x8,lo
- aesimc v1.16b,v1.16b
- mov x7,x3
- aesd v0.16b,v19.16b
- aesd v1.16b,v19.16b
- aesimc v0.16b,v0.16b
- ld1 {v2.16b},[x0],x8
- aesimc v1.16b,v1.16b
- csel x8,xzr,x8,ls
- aesd v0.16b,v20.16b
- aesd v1.16b,v20.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- ld1 {v3.16b},[x0],x8
- aesd v0.16b,v21.16b
- aesd v1.16b,v21.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- aesd v0.16b,v22.16b
- aesd v1.16b,v22.16b
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- aesd v0.16b,v23.16b
- aesd v1.16b,v23.16b
-
- mov w6,w5
- eor v4.16b,v4.16b,v0.16b
- eor v5.16b,v5.16b,v1.16b
- orr v0.16b,v2.16b,v2.16b
- st1 {v4.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
- st1 {v5.16b},[x1],#16
- b.hs .Loop2x_cbc_dec
-
- adds x2,x2,#32
- b.eq .Lcbc_done
-
-.Lcbc_dec_tail:
- aesd v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesimc v0.16b,v0.16b
- subs w6,w6,#2
- aesd v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesimc v0.16b,v0.16b
- b.gt .Lcbc_dec_tail
-
- aesd v0.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesd v0.16b,v17.16b
- aesimc v0.16b,v0.16b
- eor v4.16b,v6.16b,v7.16b
- aesd v0.16b,v18.16b
- aesimc v0.16b,v0.16b
- orr v6.16b,v2.16b,v2.16b
- aesd v0.16b,v19.16b
- aesimc v0.16b,v0.16b
- aesd v0.16b,v20.16b
- aesimc v0.16b,v0.16b
- aesd v0.16b,v21.16b
- aesimc v0.16b,v0.16b
- aesd v0.16b,v22.16b
- aesimc v0.16b,v0.16b
- aesd v0.16b,v23.16b
-
- eor v4.16b,v4.16b,v0.16b
- st1 {v4.16b},[x1],#16
-
-.Lcbc_done:
- st1 {v6.16b},[x4]
-.Lcbc_abort:
- ldr x29,[sp],#16
- ret
-.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
-.globl aes_v8_ctr32_encrypt_blocks
-.type aes_v8_ctr32_encrypt_blocks,%function
-.align 5
-aes_v8_ctr32_encrypt_blocks:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
-
- ldr w8, [x4, #12]
- ld1 {v0.4s},[x4]
-
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
- sub w5,w5,#6
- add x7,x3,x5,lsl#4 // pointer to last 7 round keys
- sub w5,w5,#2
- ld1 {v18.4s-v19.4s},[x7],#32
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
-
- add x7,x3,#32
- mov w6,w5
-
- subs x2,x2,#2
- b.lo .Lctr32_tail
-
-#ifndef __ARMEB__
- rev w8, w8
-#endif
- orr v1.16b,v0.16b,v0.16b
- add w8, w8, #1
- orr v6.16b,v0.16b,v0.16b
- rev w10, w8
- cmp w5,#2
- mov v1.s[3],w10
- b.eq .Lctr32_128
-
-.Loop2x_ctr32:
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- b.gt .Loop2x_ctr32
-
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aesmc v4.16b,v0.16b
- orr v0.16b,v6.16b,v6.16b
- aesmc v5.16b,v1.16b
- orr v1.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
- aese v5.16b,v17.16b
- ld1 {v2.16b},[x0],#16
- aesmc v4.16b,v4.16b
- ld1 {v3.16b},[x0],#16
- aesmc v5.16b,v5.16b
- add w8,w8,#1
- aese v4.16b,v18.16b
- aese v5.16b,v18.16b
- rev w9,w8
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- add w8,w8,#1
- aese v4.16b,v19.16b
- aese v5.16b,v19.16b
- eor v2.16b,v2.16b,v7.16b
- rev w10,w8
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- eor v3.16b,v3.16b,v7.16b
- mov x7,x3
- aese v4.16b,v20.16b
- aese v5.16b,v20.16b
- subs x2,x2,#2
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1]
- aese v4.16b,v21.16b
- aese v5.16b,v21.16b
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- aese v4.16b,v22.16b
- aese v5.16b,v22.16b
- mov v0.s[3], w9
- aesmc v4.16b,v4.16b
- mov v1.s[3], w10
- aesmc v5.16b,v5.16b
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
-
- mov w6,w5
- eor v2.16b,v2.16b,v4.16b
- eor v3.16b,v3.16b,v5.16b
- st1 {v2.16b},[x1],#16
- st1 {v3.16b},[x1],#16
- b.hs .Loop2x_ctr32
-
- adds x2,x2,#2
- b.eq .Lctr32_done
- b .Lctr32_tail
-
-.Lctr32_128:
- ld1 {v4.4s-v5.4s},[x7]
-
-.Loop2x_ctr32_128:
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aesmc v0.16b,v0.16b
- ld1 {v2.16b},[x0],#16
- aesmc v1.16b,v1.16b
- ld1 {v3.16b},[x0],#16
- aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- add w8,w8,#1
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- rev w9,w8
- aese v0.16b,v4.16b
- aese v1.16b,v4.16b
- add w8,w8,#1
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- rev w10,w8
- aese v0.16b,v5.16b
- aese v1.16b,v5.16b
- subs x2,x2,#2
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v18.16b
- aese v1.16b,v18.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v19.16b
- aese v1.16b,v19.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v20.16b
- aese v1.16b,v20.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v21.16b
- aese v1.16b,v21.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v22.16b
- aese v1.16b,v22.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- eor v2.16b,v2.16b,v7.16b
- aese v0.16b,v23.16b
- eor v3.16b,v3.16b,v7.16b
- aese v1.16b,v23.16b
-
- eor v2.16b,v2.16b,v0.16b
- orr v0.16b,v6.16b,v6.16b
- eor v3.16b,v3.16b,v1.16b
- orr v1.16b,v6.16b,v6.16b
- st1 {v2.16b},[x1],#16
- mov v0.s[3], w9
- st1 {v3.16b},[x1],#16
- mov v1.s[3], w10
- b.hs .Loop2x_ctr32_128
-
- adds x2,x2,#2
- b.eq .Lctr32_done
-
-.Lctr32_tail:
- aese v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesmc v0.16b,v0.16b
- subs w6,w6,#2
- aese v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesmc v0.16b,v0.16b
- b.gt .Lctr32_tail
-
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- ld1 {v2.16b},[x0]
- aese v0.16b,v18.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- eor v2.16b,v2.16b,v7.16b
- aese v0.16b,v23.16b
-
- eor v2.16b,v2.16b,v0.16b
- st1 {v2.16b},[x1]
-
-.Lctr32_done:
- ldr x29,[sp],#16
- ret
-.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
-#endif
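
The deleted aesv8-armx-64.S implemented the block routines with the ARMv8 Crypto Extensions, where AESE performs AddRoundKey+SubBytes+ShiftRows and AESMC performs MixColumns. For readers more comfortable with C, a minimal sketch of the same round structure using the ACLE intrinsics from <arm_neon.h> (compile with -march=armv8-a+crypto); this is an illustration, not the deleted code itself:

    #include <arm_neon.h>

    /* AES block encryption with ARMv8 crypto instructions: nr-1 full
     * rounds of AESE+AESMC, a final AESE without MixColumns, then the
     * last AddRoundKey as a plain XOR. rk holds nr+1 round keys. */
    static uint8x16_t encrypt_block(uint8x16_t b, const uint8x16_t *rk, int nr)
    {
        for (int i = 0; i < nr - 1; i++) {
            b = vaeseq_u8(b, rk[i]); /* AddRoundKey, SubBytes, ShiftRows */
            b = vaesmcq_u8(b);       /* MixColumns */
        }
        b = vaeseq_u8(b, rk[nr - 1]); /* last round: no MixColumns */
        return veorq_u8(b, rk[nr]);   /* final AddRoundKey */
    }
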
diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.S b/app/openssl/crypto/aes/asm/aesv8-armx.S
deleted file mode 100644
index 1637e4d4..00000000
--- a/app/openssl/crypto/aes/asm/aesv8-armx.S
+++ /dev/null
@@ -1,767 +0,0 @@
-#include "arm_arch.h"
-
-#if __ARM_ARCH__>=7
-.text
-.fpu neon
-.code 32
-.align 5
-rcon:
-.long 0x01,0x01,0x01,0x01
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
-.long 0x1b,0x1b,0x1b,0x1b
-
-.globl aes_v8_set_encrypt_key
-.type aes_v8_set_encrypt_key,%function
-.align 5
-aes_v8_set_encrypt_key:
-.Lenc_key:
- adr r3,rcon
- cmp r1,#192
-
- veor q0,q0,q0
- vld1.8 {q3},[r0]!
- mov r1,#8 @ reuse r1
- vld1.32 {q1,q2},[r3]!
-
- blt .Loop128
- beq .L192
- b .L256
-
-.align 4
-.Loop128:
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
- bne .Loop128
-
- vld1.32 {q1},[r3]
-
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
-
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- veor q3,q3,q10
- vst1.32 {q3},[r2]
- add r2,r2,#0x50
-
- mov r12,#10
- b .Ldone
-
-.align 4
-.L192:
- vld1.8 {d16},[r0]!
- vmov.i8 q10,#8 @ borrow q10
- vst1.32 {q3},[r2]!
- vsub.i8 q2,q2,q10 @ adjust the mask
-
-.Loop192:
- vtbl.8 d20,{q8},d4
- vtbl.8 d21,{q8},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {d16},[r2]!
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
-
- vdup.32 q9,d7[1]
- veor q9,q9,q8
- veor q10,q10,q1
- vext.8 q8,q0,q8,#12
- vshl.u8 q1,q1,#1
- veor q8,q8,q9
- veor q3,q3,q10
- veor q8,q8,q10
- vst1.32 {q3},[r2]!
- bne .Loop192
-
- mov r12,#12
- add r2,r2,#0x20
- b .Ldone
-
-.align 4
-.L256:
- vld1.8 {q8},[r0]
- mov r1,#7
- mov r12,#14
- vst1.32 {q3},[r2]!
-
-.Loop256:
- vtbl.8 d20,{q8},d4
- vtbl.8 d21,{q8},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q8},[r2]!
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
- vst1.32 {q3},[r2]!
- beq .Ldone
-
- vdup.32 q10,d7[1]
- vext.8 q9,q0,q8,#12
- .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q8,q8,q9
- vext.8 q9,q0,q9,#12
- veor q8,q8,q9
- vext.8 q9,q0,q9,#12
- veor q8,q8,q9
-
- veor q8,q8,q10
- b .Loop256
-
-.Ldone:
- str r12,[r2]
-
- eor r0,r0,r0 @ return value
-
- bx lr
-.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
-
-.globl aes_v8_set_decrypt_key
-.type aes_v8_set_decrypt_key,%function
-.align 5
-aes_v8_set_decrypt_key:
- stmdb sp!,{r4,lr}
- bl .Lenc_key
-
- sub r2,r2,#240 @ restore original r2
- mov r4,#-16
- add r0,r2,r12,lsl#4 @ end of key schedule
-
- vld1.32 {q0},[r2]
- vld1.32 {q1},[r0]
- vst1.32 {q0},[r0],r4
- vst1.32 {q1},[r2]!
-
-.Loop_imc:
- vld1.32 {q0},[r2]
- vld1.32 {q1},[r0]
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vst1.32 {q0},[r0],r4
- vst1.32 {q1},[r2]!
- cmp r0,r2
- bhi .Loop_imc
-
- vld1.32 {q0},[r2]
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- vst1.32 {q0},[r0]
-
- eor r0,r0,r0 @ return value
- ldmia sp!,{r4,pc}
-.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
-.globl aes_v8_encrypt
-.type aes_v8_encrypt,%function
-.align 5
-aes_v8_encrypt:
- ldr r3,[r2,#240]
- vld1.32 {q0},[r2]!
- vld1.8 {q2},[r0]
- sub r3,r3,#2
- vld1.32 {q1},[r2]!
-
-.Loop_enc:
- .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]!
- .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- subs r3,r3,#2
- .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
- vld1.32 {q1},[r2]!
- .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- bgt .Loop_enc
-
- .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]
- .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
- veor q2,q2,q0
-
- vst1.8 {q2},[r1]
- bx lr
-.size aes_v8_encrypt,.-aes_v8_encrypt
-.globl aes_v8_decrypt
-.type aes_v8_decrypt,%function
-.align 5
-aes_v8_decrypt:
- ldr r3,[r2,#240]
- vld1.32 {q0},[r2]!
- vld1.8 {q2},[r0]
- sub r3,r3,#2
- vld1.32 {q1},[r2]!
-
-.Loop_dec:
- .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]!
- .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- subs r3,r3,#2
- .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
- vld1.32 {q1},[r2]!
- .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- bgt .Loop_dec
-
- .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]
- .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
- veor q2,q2,q0
-
- vst1.8 {q2},[r1]
- bx lr
-.size aes_v8_decrypt,.-aes_v8_decrypt
-.globl aes_v8_cbc_encrypt
-.type aes_v8_cbc_encrypt,%function
-.align 5
-aes_v8_cbc_encrypt:
- mov ip,sp
- stmdb sp!,{r4-r8,lr}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldmia ip,{r4-r5} @ load remaining args
- subs r2,r2,#16
- mov r8,#16
- blo .Lcbc_abort
- moveq r8,#0
-
- cmp r5,#0 @ en- or decrypting?
- ldr r5,[r3,#240]
- and r2,r2,#-16
- vld1.8 {q6},[r4]
- vld1.8 {q0},[r0],r8
-
- vld1.32 {q8-q9},[r3] @ load key schedule...
- sub r5,r5,#6
- add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
- sub r5,r5,#2
- vld1.32 {q10-q11},[r7]!
- vld1.32 {q12-q13},[r7]!
- vld1.32 {q14-q15},[r7]!
- vld1.32 {q7},[r7]
-
- add r7,r3,#32
- mov r6,r5
- beq .Lcbc_dec
-
- cmp r5,#2
- veor q0,q0,q6
- veor q5,q8,q7
- beq .Lcbc_enc128
-
-.Loop_cbc_enc:
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- vld1.32 {q8},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r6,r6,#2
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- vld1.32 {q9},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- bgt .Loop_cbc_enc
-
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r2,r2,#16
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- moveq r8,#0
- .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- add r7,r3,#16
- .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q8},[r0],r8
- .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- veor q8,q8,q5
- .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
-
- mov r6,r5
- veor q6,q0,q7
- vst1.8 {q6},[r1]!
- bhs .Loop_cbc_enc
-
- b .Lcbc_done
-
-.align 5
-.Lcbc_enc128:
- vld1.32 {q2-q3},[r7]
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- b .Lenter_cbc_enc128
-.Loop_cbc_enc128:
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vst1.8 {q6},[r1]!
-.Lenter_cbc_enc128:
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r2,r2,#16
- .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- moveq r8,#0
- .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q8},[r0],r8
- .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- veor q8,q8,q5
- .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
- veor q6,q0,q7
- bhs .Loop_cbc_enc128
-
- vst1.8 {q6},[r1]!
- b .Lcbc_done
-
-.align 5
-.Lcbc_dec128:
- vld1.32 {q4-q5},[r7]
- veor q6,q6,q7
- veor q2,q0,q7
- mov r12,r8
-
-.Loop2x_cbc_dec128:
- .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- subs r2,r2,#32
- .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- movlo r8,#0
- .byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4
- .byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- movls r12,#0
- .byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5
- .byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
- .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
- .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
- .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
- .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
- .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
- .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
-
- veor q6,q6,q0
- vld1.8 {q0},[r0],r8
- veor q2,q2,q1
- vld1.8 {q1},[r0],r12
- vst1.8 {q6},[r1]!
- veor q6,q3,q7
- vst1.8 {q2},[r1]!
- veor q2,q0,q7
- vorr q3,q1,q1
- bhs .Loop2x_cbc_dec128
-
- adds r2,r2,#32
- veor q6,q6,q7
- beq .Lcbc_done
- veor q2,q2,q7
- b .Lcbc_dec_tail
-
-.align 5
-.Lcbc_dec:
- subs r2,r2,#16
- vorr q2,q0,q0
- blo .Lcbc_dec_tail
-
- moveq r8,#0
- cmp r5,#2
- vld1.8 {q1},[r0],r8
- vorr q3,q1,q1
- beq .Lcbc_dec128
-
-.Loop2x_cbc_dec:
- .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- vld1.32 {q8},[r7]!
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- subs r6,r6,#2
- .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- vld1.32 {q9},[r7]!
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- bgt .Loop2x_cbc_dec
-
- .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- veor q4,q6,q7
- veor q5,q2,q7
- .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vorr q6,q3,q3
- subs r2,r2,#32
- .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
- .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- movlo r8,#0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- mov r7,r3
- .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
- .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- vld1.8 {q2},[r0],r8
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- movls r8,#0
- .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
- .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vld1.8 {q3},[r0],r8
- .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
- .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
- .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
- .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
- .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
-
- mov r6,r5
- veor q4,q4,q0
- veor q5,q5,q1
- vorr q0,q2,q2
- vst1.8 {q4},[r1]!
- vorr q1,q3,q3
- vst1.8 {q5},[r1]!
- bhs .Loop2x_cbc_dec
-
- adds r2,r2,#32
- beq .Lcbc_done
-
-.Lcbc_dec_tail:
- .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- vld1.32 {q8},[r7]!
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- subs r6,r6,#2
- .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- vld1.32 {q9},[r7]!
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- bgt .Lcbc_dec_tail
-
- .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- veor q4,q6,q7
- .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- vorr q6,q2,q2
- .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
-
- veor q4,q4,q0
- vst1.8 {q4},[r1]!
-
-.Lcbc_done:
- vst1.8 {q6},[r4]
-.Lcbc_abort:
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r8,pc}
-.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
-.globl aes_v8_ctr32_encrypt_blocks
-.type aes_v8_ctr32_encrypt_blocks,%function
-.align 5
-aes_v8_ctr32_encrypt_blocks:
- mov ip,sp
- stmdb sp!,{r4-r10,lr}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldr r4, [ip] @ load remaining arg
- ldr r5,[r3,#240]
-
- ldr r8, [r4, #12]
- vld1.32 {q0},[r4]
-
- vld1.32 {q8-q9},[r3] @ load key schedule...
- sub r5,r5,#6
- add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
- sub r5,r5,#2
- vld1.32 {q10-q11},[r7]!
- vld1.32 {q12-q13},[r7]!
- vld1.32 {q14-q15},[r7]!
- vld1.32 {q7},[r7]
-
- add r7,r3,#32
- mov r6,r5
-
- subs r2,r2,#2
- blo .Lctr32_tail
-
-#ifndef __ARMEB__
- rev r8, r8
-#endif
- vorr q1,q0,q0
- add r8, r8, #1
- vorr q6,q0,q0
- rev r10, r8
- cmp r5,#2
- vmov.32 d3[1],r10
- beq .Lctr32_128
-
-.Loop2x_ctr32:
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- vld1.32 {q8},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- subs r6,r6,#2
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- vld1.32 {q9},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- bgt .Loop2x_ctr32
-
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
- vorr q0,q6,q6
- .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
- vorr q1,q6,q6
- .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
- .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
- vld1.8 {q2},[r0]!
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vld1.8 {q3},[r0]!
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- add r8,r8,#1
- .byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10
- .byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10
- rev r9,r8
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- add r8,r8,#1
- .byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11
- .byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11
- veor q2,q2,q7
- rev r10,r8
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- veor q3,q3,q7
- mov r7,r3
- .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
- .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
- subs r2,r2,#2
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1]
- .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
- .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
- .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
- vmov.32 d1[1], r9
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vmov.32 d3[1], r10
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
- .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
-
- mov r6,r5
- veor q2,q2,q4
- veor q3,q3,q5
- vst1.8 {q2},[r1]!
- vst1.8 {q3},[r1]!
- bhs .Loop2x_ctr32
-
- adds r2,r2,#2
- beq .Lctr32_done
- b .Lctr32_tail
-
-.Lctr32_128:
- vld1.32 {q4-q5},[r7]
-
-.Loop2x_ctr32_128:
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q2},[r0]!
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- vld1.8 {q3},[r0]!
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- add r8,r8,#1
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- rev r9,r8
- .byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4
- .byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4
- add r8,r8,#1
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- rev r10,r8
- .byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5
- .byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5
- subs r2,r2,#2
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
- .byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
- .byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- veor q2,q2,q7
- .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
- veor q3,q3,q7
- .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
-
- veor q2,q2,q0
- vorr q0,q6,q6
- veor q3,q3,q1
- vorr q1,q6,q6
- vst1.8 {q2},[r1]!
- vmov.32 d1[1], r9
- vst1.8 {q3},[r1]!
- vmov.32 d3[1], r10
- bhs .Loop2x_ctr32_128
-
- adds r2,r2,#2
- beq .Lctr32_done
-
-.Lctr32_tail:
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- vld1.32 {q8},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r6,r6,#2
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- vld1.32 {q9},[r7]!
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- bgt .Lctr32_tail
-
- .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q2},[r0]
- .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- veor q2,q2,q7
- .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
-
- veor q2,q2,q0
- vst1.8 {q2},[r1]
-
-.Lctr32_done:
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r10,pc}
-.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
-#endif
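
The 32-bit aesv8-armx.S above is the same module with the crypto instructions hand-encoded as .byte sequences, since 32-bit assemblers of the period lacked the aese/aesd mnemonics. One detail worth noting in the deleted aes_v8_ctr32_encrypt_blocks is the rev/add/rev counter handling: only the trailing 32 bits of the counter block are incremented, big-endian. A small C sketch of that bump, assuming a 16-byte counter block laid out as in the assembly:

    #include <stdint.h>

    /* Increment the trailing 32-bit big-endian counter of a 16-byte
     * CTR block, matching the rev r8,r8 / add r8,r8,#1 / rev r10,r8
     * pattern in the deleted code. */
    static void ctr32_inc(uint8_t ivec[16])
    {
        uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                       ((uint32_t)ivec[14] <<  8) |  (uint32_t)ivec[15];
        ctr++; /* wraps modulo 2^32, as "ctr32" implies */
        ivec[12] = (uint8_t)(ctr >> 24);
        ivec[13] = (uint8_t)(ctr >> 16);
        ivec[14] = (uint8_t)(ctr >>  8);
        ivec[15] = (uint8_t)ctr;
    }
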
diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.pl b/app/openssl/crypto/aes/asm/aesv8-armx.pl
deleted file mode 100644
index 415dc04a..00000000
--- a/app/openssl/crypto/aes/asm/aesv8-armx.pl
+++ /dev/null
@@ -1,980 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements support for ARMv8 AES instructions. The
-# module is endian-agnostic in sense that it supports both big- and
-# little-endian cases. As does it support both 32- and 64-bit modes
-# of operation. Latter is achieved by limiting amount of utilized
-# registers to 16, which implies additional instructions. This has
-# no effect on mighty Apple A7, as results are literally equal to
-# the theoretical estimates based on instruction latencies and issue
-# rate. It remains to be seen how does it affect other platforms...
-#
-# Performance in cycles per byte processed with 128-bit key:
-#
-# CBC enc CBC dec CTR
-# Apple A7 2.39 1.20 1.20
-# Cortex-A5x n/a n/a n/a
-
-$flavour = shift;
-open STDOUT,">".shift;
-
-$prefix="aes_v8";
-
-$code=<<___;
-#include "arm_arch.h"
-
-#if __ARM_ARCH__>=7
-.text
-___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
-
-# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
-# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
-# maintain both 32- and 64-bit codes within single module and
-# transliterate common code to either flavour with regex voodoo.
-#
-{{{
-my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
-my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
- $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
-
-
-$code.=<<___;
-.align 5
-rcon:
-.long 0x01,0x01,0x01,0x01
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
-.long 0x1b,0x1b,0x1b,0x1b
-
-.globl ${prefix}_set_encrypt_key
-.type ${prefix}_set_encrypt_key,%function
-.align 5
-${prefix}_set_encrypt_key:
-.Lenc_key:
-___
-$code.=<<___ if ($flavour =~ /64/);
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-___
-$code.=<<___;
- adr $ptr,rcon
- cmp $bits,#192
-
- veor $zero,$zero,$zero
- vld1.8 {$in0},[$inp],#16
- mov $bits,#8 // reuse $bits
- vld1.32 {$rcon,$mask},[$ptr],#32
-
- b.lt .Loop128
- b.eq .L192
- b .L256
-
-.align 4
-.Loop128:
- vtbl.8 $key,{$in0},$mask
- vext.8 $tmp,$zero,$in0,#12
- vst1.32 {$in0},[$out],#16
- aese $key,$zero
- subs $bits,$bits,#1
-
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $key,$key,$rcon
- veor $in0,$in0,$tmp
- vshl.u8 $rcon,$rcon,#1
- veor $in0,$in0,$key
- b.ne .Loop128
-
- vld1.32 {$rcon},[$ptr]
-
- vtbl.8 $key,{$in0},$mask
- vext.8 $tmp,$zero,$in0,#12
- vst1.32 {$in0},[$out],#16
- aese $key,$zero
-
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $key,$key,$rcon
- veor $in0,$in0,$tmp
- vshl.u8 $rcon,$rcon,#1
- veor $in0,$in0,$key
-
- vtbl.8 $key,{$in0},$mask
- vext.8 $tmp,$zero,$in0,#12
- vst1.32 {$in0},[$out],#16
- aese $key,$zero
-
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $key,$key,$rcon
- veor $in0,$in0,$tmp
- veor $in0,$in0,$key
- vst1.32 {$in0},[$out]
- add $out,$out,#0x50
-
- mov $rounds,#10
- b .Ldone
-
-.align 4
-.L192:
- vld1.8 {$in1},[$inp],#8
- vmov.i8 $key,#8 // borrow $key
- vst1.32 {$in0},[$out],#16
- vsub.i8 $mask,$mask,$key // adjust the mask
-
-.Loop192:
- vtbl.8 $key,{$in1},$mask
- vext.8 $tmp,$zero,$in0,#12
- vst1.32 {$in1},[$out],#8
- aese $key,$zero
- subs $bits,$bits,#1
-
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
-
- vdup.32 $tmp,${in0}[3]
- veor $tmp,$tmp,$in1
- veor $key,$key,$rcon
- vext.8 $in1,$zero,$in1,#12
- vshl.u8 $rcon,$rcon,#1
- veor $in1,$in1,$tmp
- veor $in0,$in0,$key
- veor $in1,$in1,$key
- vst1.32 {$in0},[$out],#16
- b.ne .Loop192
-
- mov $rounds,#12
- add $out,$out,#0x20
- b .Ldone
-
-.align 4
-.L256:
- vld1.8 {$in1},[$inp]
- mov $bits,#7
- mov $rounds,#14
- vst1.32 {$in0},[$out],#16
-
-.Loop256:
- vtbl.8 $key,{$in1},$mask
- vext.8 $tmp,$zero,$in0,#12
- vst1.32 {$in1},[$out],#16
- aese $key,$zero
- subs $bits,$bits,#1
-
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in0,$in0,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $key,$key,$rcon
- veor $in0,$in0,$tmp
- vshl.u8 $rcon,$rcon,#1
- veor $in0,$in0,$key
- vst1.32 {$in0},[$out],#16
- b.eq .Ldone
-
- vdup.32 $key,${in0}[3] // just splat
- vext.8 $tmp,$zero,$in1,#12
- aese $key,$zero
-
- veor $in1,$in1,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in1,$in1,$tmp
- vext.8 $tmp,$zero,$tmp,#12
- veor $in1,$in1,$tmp
-
- veor $in1,$in1,$key
- b .Loop256
-
-.Ldone:
- str $rounds,[$out]
-
- eor x0,x0,x0 // return value
- `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
- ret
-.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
-
-.globl ${prefix}_set_decrypt_key
-.type ${prefix}_set_decrypt_key,%function
-.align 5
-${prefix}_set_decrypt_key:
-___
-$code.=<<___ if ($flavour =~ /64/);
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-___
-$code.=<<___ if ($flavour !~ /64/);
- stmdb sp!,{r4,lr}
-___
-$code.=<<___;
- bl .Lenc_key
-
- sub $out,$out,#240 // restore original $out
- mov x4,#-16
- add $inp,$out,x12,lsl#4 // end of key schedule
-
- vld1.32 {v0.16b},[$out]
- vld1.32 {v1.16b},[$inp]
- vst1.32 {v0.16b},[$inp],x4
- vst1.32 {v1.16b},[$out],#16
-
-.Loop_imc:
- vld1.32 {v0.16b},[$out]
- vld1.32 {v1.16b},[$inp]
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- vst1.32 {v0.16b},[$inp],x4
- vst1.32 {v1.16b},[$out],#16
- cmp $inp,$out
- b.hi .Loop_imc
-
- vld1.32 {v0.16b},[$out]
- aesimc v0.16b,v0.16b
- vst1.32 {v0.16b},[$inp]
-
- eor x0,x0,x0 // return value
-___
-$code.=<<___ if ($flavour !~ /64/);
- ldmia sp!,{r4,pc}
-___
-$code.=<<___ if ($flavour =~ /64/);
- ldp x29,x30,[sp],#16
- ret
-___
-$code.=<<___;
-.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
-___
-}}}
-{{{
-sub gen_block () {
-my $dir = shift;
-my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
-my ($inp,$out,$key)=map("x$_",(0..2));
-my $rounds="w3";
-my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
-
-$code.=<<___;
-.globl ${prefix}_${dir}crypt
-.type ${prefix}_${dir}crypt,%function
-.align 5
-${prefix}_${dir}crypt:
- ldr $rounds,[$key,#240]
- vld1.32 {$rndkey0},[$key],#16
- vld1.8 {$inout},[$inp]
- sub $rounds,$rounds,#2
- vld1.32 {$rndkey1},[$key],#16
-
-.Loop_${dir}c:
- aes$e $inout,$rndkey0
- vld1.32 {$rndkey0},[$key],#16
- aes$mc $inout,$inout
- subs $rounds,$rounds,#2
- aes$e $inout,$rndkey1
- vld1.32 {$rndkey1},[$key],#16
- aes$mc $inout,$inout
- b.gt .Loop_${dir}c
-
- aes$e $inout,$rndkey0
- vld1.32 {$rndkey0},[$key]
- aes$mc $inout,$inout
- aes$e $inout,$rndkey1
- veor $inout,$inout,$rndkey0
-
- vst1.8 {$inout},[$out]
- ret
-.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
-___
-}
-&gen_block("en");
-&gen_block("de");
-}}}
-{{{
-my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
-my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
-my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
-
-my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
-
-### q8-q15 preloaded key schedule
-
-$code.=<<___;
-.globl ${prefix}_cbc_encrypt
-.type ${prefix}_cbc_encrypt,%function
-.align 5
-${prefix}_cbc_encrypt:
-___
-$code.=<<___ if ($flavour =~ /64/);
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-___
-$code.=<<___ if ($flavour !~ /64/);
- mov ip,sp
- stmdb sp!,{r4-r8,lr}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldmia ip,{r4-r5} @ load remaining args
-___
-$code.=<<___;
- subs $len,$len,#16
- mov $step,#16
- b.lo .Lcbc_abort
- cclr $step,eq
-
- cmp $enc,#0 // en- or decrypting?
- ldr $rounds,[$key,#240]
- and $len,$len,#-16
- vld1.8 {$ivec},[$ivp]
- vld1.8 {$dat},[$inp],$step
-
- vld1.32 {q8-q9},[$key] // load key schedule...
- sub $rounds,$rounds,#6
- add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
- sub $rounds,$rounds,#2
- vld1.32 {q10-q11},[$key_],#32
- vld1.32 {q12-q13},[$key_],#32
- vld1.32 {q14-q15},[$key_],#32
- vld1.32 {$rndlast},[$key_]
-
- add $key_,$key,#32
- mov $cnt,$rounds
- b.eq .Lcbc_dec
-
- cmp $rounds,#2
- veor $dat,$dat,$ivec
- veor $rndzero_n_last,q8,$rndlast
- b.eq .Lcbc_enc128
-
-.Loop_cbc_enc:
- aese $dat,q8
- vld1.32 {q8},[$key_],#16
- aesmc $dat,$dat
- subs $cnt,$cnt,#2
- aese $dat,q9
- vld1.32 {q9},[$key_],#16
- aesmc $dat,$dat
- b.gt .Loop_cbc_enc
-
- aese $dat,q8
- aesmc $dat,$dat
- subs $len,$len,#16
- aese $dat,q9
- aesmc $dat,$dat
- cclr $step,eq
- aese $dat,q10
- aesmc $dat,$dat
- add $key_,$key,#16
- aese $dat,q11
- aesmc $dat,$dat
- vld1.8 {q8},[$inp],$step
- aese $dat,q12
- aesmc $dat,$dat
- veor q8,q8,$rndzero_n_last
- aese $dat,q13
- aesmc $dat,$dat
- vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
- aese $dat,q14
- aesmc $dat,$dat
- aese $dat,q15
-
- mov $cnt,$rounds
- veor $ivec,$dat,$rndlast
- vst1.8 {$ivec},[$out],#16
- b.hs .Loop_cbc_enc
-
- b .Lcbc_done
-
-.align 5
-.Lcbc_enc128:
- vld1.32 {$in0-$in1},[$key_]
- aese $dat,q8
- aesmc $dat,$dat
- b .Lenter_cbc_enc128
-.Loop_cbc_enc128:
- aese $dat,q8
- aesmc $dat,$dat
- vst1.8 {$ivec},[$out],#16
-.Lenter_cbc_enc128:
- aese $dat,q9
- aesmc $dat,$dat
- subs $len,$len,#16
- aese $dat,$in0
- aesmc $dat,$dat
- cclr $step,eq
- aese $dat,$in1
- aesmc $dat,$dat
- aese $dat,q10
- aesmc $dat,$dat
- aese $dat,q11
- aesmc $dat,$dat
- vld1.8 {q8},[$inp],$step
- aese $dat,q12
- aesmc $dat,$dat
- aese $dat,q13
- aesmc $dat,$dat
- aese $dat,q14
- aesmc $dat,$dat
- veor q8,q8,$rndzero_n_last
- aese $dat,q15
- veor $ivec,$dat,$rndlast
- b.hs .Loop_cbc_enc128
-
- vst1.8 {$ivec},[$out],#16
- b .Lcbc_done
-
-.align 5
-.Lcbc_dec128:
- vld1.32 {$tmp0-$tmp1},[$key_]
- veor $ivec,$ivec,$rndlast
- veor $in0,$dat0,$rndlast
- mov $step1,$step
-
-.Loop2x_cbc_dec128:
- aesd $dat0,q8
- aesd $dat1,q8
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- subs $len,$len,#32
- aesd $dat0,q9
- aesd $dat1,q9
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step,lo
- aesd $dat0,$tmp0
- aesd $dat1,$tmp0
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step1,ls
- aesd $dat0,$tmp1
- aesd $dat1,$tmp1
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q12
- aesd $dat1,q12
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q13
- aesd $dat1,q13
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q14
- aesd $dat1,q14
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q15
- aesd $dat1,q15
-
- veor $ivec,$ivec,$dat0
- vld1.8 {$dat0},[$inp],$step
- veor $in0,$in0,$dat1
- vld1.8 {$dat1},[$inp],$step1
- vst1.8 {$ivec},[$out],#16
- veor $ivec,$in1,$rndlast
- vst1.8 {$in0},[$out],#16
- veor $in0,$dat0,$rndlast
- vorr $in1,$dat1,$dat1
- b.hs .Loop2x_cbc_dec128
-
- adds $len,$len,#32
- veor $ivec,$ivec,$rndlast
- b.eq .Lcbc_done
- veor $in0,$in0,$rndlast
- b .Lcbc_dec_tail
-
-.align 5
-.Lcbc_dec:
- subs $len,$len,#16
- vorr $in0,$dat,$dat
- b.lo .Lcbc_dec_tail
-
- cclr $step,eq
- cmp $rounds,#2
- vld1.8 {$dat1},[$inp],$step
- vorr $in1,$dat1,$dat1
- b.eq .Lcbc_dec128
-
-.Loop2x_cbc_dec:
- aesd $dat0,q8
- aesd $dat1,q8
- vld1.32 {q8},[$key_],#16
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- subs $cnt,$cnt,#2
- aesd $dat0,q9
- aesd $dat1,q9
- vld1.32 {q9},[$key_],#16
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- b.gt .Loop2x_cbc_dec
-
- aesd $dat0,q8
- aesd $dat1,q8
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- veor $tmp0,$ivec,$rndlast
- veor $tmp1,$in0,$rndlast
- aesd $dat0,q9
- aesd $dat1,q9
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- vorr $ivec,$in1,$in1
- subs $len,$len,#32
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- cclr $step,lo
- aesimc $dat1,$dat1
- mov $key_,$key
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- vld1.8 {$in0},[$inp],$step
- aesimc $dat1,$dat1
- cclr $step,ls
- aesd $dat0,q12
- aesd $dat1,q12
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- vld1.8 {$in1},[$inp],$step
- aesd $dat0,q13
- aesd $dat1,q13
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
- aesd $dat0,q14
- aesd $dat1,q14
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
- aesd $dat0,q15
- aesd $dat1,q15
-
- mov $cnt,$rounds
- veor $tmp0,$tmp0,$dat0
- veor $tmp1,$tmp1,$dat1
- vorr $dat0,$in0,$in0
- vst1.8 {$tmp0},[$out],#16
- vorr $dat1,$in1,$in1
- vst1.8 {$tmp1},[$out],#16
- b.hs .Loop2x_cbc_dec
-
- adds $len,$len,#32
- b.eq .Lcbc_done
-
-.Lcbc_dec_tail:
- aesd $dat,q8
- vld1.32 {q8},[$key_],#16
- aesimc $dat,$dat
- subs $cnt,$cnt,#2
- aesd $dat,q9
- vld1.32 {q9},[$key_],#16
- aesimc $dat,$dat
- b.gt .Lcbc_dec_tail
-
- aesd $dat,q8
- aesimc $dat,$dat
- aesd $dat,q9
- aesimc $dat,$dat
- veor $tmp,$ivec,$rndlast
- aesd $dat,q10
- aesimc $dat,$dat
- vorr $ivec,$in0,$in0
- aesd $dat,q11
- aesimc $dat,$dat
- aesd $dat,q12
- aesimc $dat,$dat
- aesd $dat,q13
- aesimc $dat,$dat
- aesd $dat,q14
- aesimc $dat,$dat
- aesd $dat,q15
-
- veor $tmp,$tmp,$dat
- vst1.8 {$tmp},[$out],#16
-
-.Lcbc_done:
- vst1.8 {$ivec},[$ivp]
-.Lcbc_abort:
-___
-$code.=<<___ if ($flavour !~ /64/);
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r8,pc}
-___
-$code.=<<___ if ($flavour =~ /64/);
- ldr x29,[sp],#16
- ret
-___
-$code.=<<___;
-.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
-___
-}}}
-{{{
-my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
-my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
-my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
-
-my ($dat,$tmp)=($dat0,$tmp0);
-
-### q8-q15 preloaded key schedule
-
-$code.=<<___;
-.globl ${prefix}_ctr32_encrypt_blocks
-.type ${prefix}_ctr32_encrypt_blocks,%function
-.align 5
-${prefix}_ctr32_encrypt_blocks:
-___
-$code.=<<___ if ($flavour =~ /64/);
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-___
-$code.=<<___ if ($flavour !~ /64/);
- mov ip,sp
- stmdb sp!,{r4-r10,lr}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldr r4, [ip] @ load remaining arg
-___
-$code.=<<___;
- ldr $rounds,[$key,#240]
-
- ldr $ctr, [$ivp, #12]
- vld1.32 {$dat0},[$ivp]
-
- vld1.32 {q8-q9},[$key] // load key schedule...
- sub $rounds,$rounds,#6
- add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
- sub $rounds,$rounds,#2
- vld1.32 {q10-q11},[$key_],#32
- vld1.32 {q12-q13},[$key_],#32
- vld1.32 {q14-q15},[$key_],#32
- vld1.32 {$rndlast},[$key_]
-
- add $key_,$key,#32
- mov $cnt,$rounds
-
- subs $len,$len,#2
- b.lo .Lctr32_tail
-
-#ifndef __ARMEB__
- rev $ctr, $ctr
-#endif
- vorr $dat1,$dat0,$dat0
- add $ctr, $ctr, #1
- vorr $ivec,$dat0,$dat0
- rev $tctr1, $ctr
- cmp $rounds,#2
- vmov.32 ${dat1}[3],$tctr1
- b.eq .Lctr32_128
-
-.Loop2x_ctr32:
- aese $dat0,q8
- aese $dat1,q8
- vld1.32 {q8},[$key_],#16
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- subs $cnt,$cnt,#2
- aese $dat0,q9
- aese $dat1,q9
- vld1.32 {q9},[$key_],#16
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- b.gt .Loop2x_ctr32
-
- aese $dat0,q8
- aese $dat1,q8
- aesmc $tmp0,$dat0
- vorr $dat0,$ivec,$ivec
- aesmc $tmp1,$dat1
- vorr $dat1,$ivec,$ivec
- aese $tmp0,q9
- aese $tmp1,q9
- vld1.8 {$in0},[$inp],#16
- aesmc $tmp0,$tmp0
- vld1.8 {$in1},[$inp],#16
- aesmc $tmp1,$tmp1
- add $ctr,$ctr,#1
- aese $tmp0,q10
- aese $tmp1,q10
- rev $tctr,$ctr
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
- add $ctr,$ctr,#1
- aese $tmp0,q11
- aese $tmp1,q11
- veor $in0,$in0,$rndlast
- rev $tctr1,$ctr
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
- veor $in1,$in1,$rndlast
- mov $key_,$key
- aese $tmp0,q12
- aese $tmp1,q12
- subs $len,$len,#2
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
- vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
- aese $tmp0,q13
- aese $tmp1,q13
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
- aese $tmp0,q14
- aese $tmp1,q14
- vmov.32 ${dat0}[3], $tctr
- aesmc $tmp0,$tmp0
- vmov.32 ${dat1}[3], $tctr1
- aesmc $tmp1,$tmp1
- aese $tmp0,q15
- aese $tmp1,q15
-
- mov $cnt,$rounds
- veor $in0,$in0,$tmp0
- veor $in1,$in1,$tmp1
- vst1.8 {$in0},[$out],#16
- vst1.8 {$in1},[$out],#16
- b.hs .Loop2x_ctr32
-
- adds $len,$len,#2
- b.eq .Lctr32_done
- b .Lctr32_tail
-
-.Lctr32_128:
- vld1.32 {$tmp0-$tmp1},[$key_]
-
-.Loop2x_ctr32_128:
- aese $dat0,q8
- aese $dat1,q8
- aesmc $dat0,$dat0
- vld1.8 {$in0},[$inp],#16
- aesmc $dat1,$dat1
- vld1.8 {$in1},[$inp],#16
- aese $dat0,q9
- aese $dat1,q9
- add $ctr,$ctr,#1
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- rev $tctr,$ctr
- aese $dat0,$tmp0
- aese $dat1,$tmp0
- add $ctr,$ctr,#1
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- rev $tctr1,$ctr
- aese $dat0,$tmp1
- aese $dat1,$tmp1
- subs $len,$len,#2
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- aese $dat0,q10
- aese $dat1,q10
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- aese $dat0,q11
- aese $dat1,q11
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- aese $dat0,q12
- aese $dat1,q12
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- aese $dat0,q13
- aese $dat1,q13
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- aese $dat0,q14
- aese $dat1,q14
- aesmc $dat0,$dat0
- aesmc $dat1,$dat1
- veor $in0,$in0,$rndlast
- aese $dat0,q15
- veor $in1,$in1,$rndlast
- aese $dat1,q15
-
- veor $in0,$in0,$dat0
- vorr $dat0,$ivec,$ivec
- veor $in1,$in1,$dat1
- vorr $dat1,$ivec,$ivec
- vst1.8 {$in0},[$out],#16
- vmov.32 ${dat0}[3], $tctr
- vst1.8 {$in1},[$out],#16
- vmov.32 ${dat1}[3], $tctr1
- b.hs .Loop2x_ctr32_128
-
- adds $len,$len,#2
- b.eq .Lctr32_done
-
-.Lctr32_tail:
- aese $dat,q8
- vld1.32 {q8},[$key_],#16
- aesmc $dat,$dat
- subs $cnt,$cnt,#2
- aese $dat,q9
- vld1.32 {q9},[$key_],#16
- aesmc $dat,$dat
- b.gt .Lctr32_tail
-
- aese $dat,q8
- aesmc $dat,$dat
- aese $dat,q9
- aesmc $dat,$dat
- vld1.8 {$in0},[$inp]
- aese $dat,q10
- aesmc $dat,$dat
- aese $dat,q11
- aesmc $dat,$dat
- aese $dat,q12
- aesmc $dat,$dat
- aese $dat,q13
- aesmc $dat,$dat
- aese $dat,q14
- aesmc $dat,$dat
- veor $in0,$in0,$rndlast
- aese $dat,q15
-
- veor $in0,$in0,$dat
- vst1.8 {$in0},[$out]
-
-.Lctr32_done:
-___
-$code.=<<___ if ($flavour !~ /64/);
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r10,pc}
-___
-$code.=<<___ if ($flavour =~ /64/);
- ldr x29,[sp],#16
- ret
-___
-$code.=<<___;
-.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
-___
-}}}
-$code.=<<___;
-#endif
-___
-########################################
-if ($flavour =~ /64/) { ######## 64-bit code
- my %opcode = (
- "aesd" => 0x4e285800, "aese" => 0x4e284800,
- "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
-
- local *unaes = sub {
- my ($mnemonic,$arg)=@_;
-
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
- sprintf ".inst\t0x%08x\t//%s %s",
- $opcode{$mnemonic}|$1|($2<<5),
- $mnemonic,$arg;
- };
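
For readers decoding the opcode table above: the destination register lands in bits [4:0] and the source register in bits [9:5] of each AArch64 AES opcode, which is exactly what `$opcode{$mnemonic}|$1|($2<<5)` computes. A minimal standalone sketch (hypothetical demo code, not part of the generator; note the call site further down is commented out, so assemblers with native AES support handle these mnemonics directly):

    #!/usr/bin/env perl
    # Hedged sketch: encode an AArch64 AES instruction as a raw .inst
    # word, mirroring the unaes closure above. encode() is a demo helper.
    my %opcode = (
        "aesd"  => 0x4e285800, "aese"  => 0x4e284800,
        "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

    sub encode {
        my ($mnemonic, $rd, $rn) = @_;
        return $opcode{$mnemonic} | $rd | ($rn << 5);  # Rd in [4:0], Rn in [9:5]
    }

    # aese v2.16b,v3.16b -> .inst 0x4e284862
    printf ".inst\t0x%08x\t// aese v2.16b,v3.16b\n", encode("aese", 2, 3);
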
-
- foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
-
- s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
- s/@\s/\/\//o; # old->new style commentary
-
- #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
- s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
- s/vmov\.i8/movi/o or # fix up legacy mnemonics
- s/vext\.8/ext/o or
- s/vrev32\.8/rev32/o or
- s/vtst\.8/cmtst/o or
- s/vshr/ushr/o or
- s/^(\s+)v/$1/o or # strip off v prefix
- s/\bbx\s+lr\b/ret/o;
-
-	# fix up remaining legacy suffixes
- s/\.[ui]?8//o;
- m/\],#8/o and s/\.16b/\.8b/go;
- s/\.[ui]?32//o and s/\.16b/\.4s/go;
- s/\.[ui]?64//o and s/\.16b/\.2d/go;
- s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
-
- print $_,"\n";
- }
-} else { ######## 32-bit code
- my %opcode = (
- "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
- "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
-
- local *unaes = sub {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
- my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<1) |(($2&8)<<2);
-	    # emit the raw bytes in little-endian order, since ARMv7
-	    # instructions are always encoded little-endian. The correct
-	    # solution is to use the .inst directive, but older
-	    # assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- };
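
The 32-bit branch cannot assume the assembler knows the AES mnemonics at all, so it emits the four opcode bytes explicitly, low byte first. A standalone sketch (hypothetical demo, same field layout as the closure above):

    #!/usr/bin/env perl
    # Hedged sketch of the .byte emission above: split the 32-bit NEON
    # opcode into bytes in little-endian order, the order in which
    # ARMv7 instructions are stored in memory.
    my ($qd, $qm) = (0, 1);                       # aese q0,q1
    my $word = 0xf3b00300                         # base opcode for aese
             | (($qd&7)<<13) | (($qd&8)<<19)      # Vd/D fields
             | (($qm&7)<<1)  | (($qm&8)<<2);      # Vm/M fields
    printf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ aese q%d,q%d\n",
           $word&0xff, ($word>>8)&0xff,
           ($word>>16)&0xff, ($word>>24)&0xff, $qd, $qm;
    # prints: .byte 0x02,0x03,0xb0,0xf3 @ aese q0,q1
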
-
- sub unvtbl {
- my $arg=shift;
-
- $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
- sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
- "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
- }
-
- sub unvdup32 {
- my $arg=shift;
-
- $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
- }
-
- sub unvmov32 {
- my $arg=shift;
-
- $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
- sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
- }
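
These three helpers all rely on the same NEON register aliasing: 128-bit register qN overlays 64-bit registers d(2N) and d(2N+1), so lane L of qN is lane L&1 of d(2N+(L>>1)). A small sketch of the mapping used by unvmov32 (q_lane_to_d is a hypothetical demo helper, not part of the generator):

    # Hedged sketch: map a q-register lane to its d-register lane,
    # as unvmov32 above does.
    sub q_lane_to_d {
        my ($q, $lane) = @_;
        return (2*$q + ($lane >> 1), $lane & 1);
    }
    my ($d, $l) = q_lane_to_d(0, 3);            # q0[3] -> d1[1]
    printf "vmov.32\td%d[%d],r8\n", $d, $l;     # prints: vmov.32 d1[1],r8
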
-
- foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
-
- s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
- s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
- s/\/\/\s?/@ /o; # new->old style commentary
-
-	# fix up remaining new-style suffixes
- s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
- s/\],#[0-9]+/]!/o;
-
- s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
- s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
- s/vdup\.32\s+(.*)/unvdup32($1)/geo or
- s/vmov\.32\s+(.*)/unvmov32($1)/geo or
- s/^(\s+)b\./$1b/o or
- s/^(\s+)ret/$1bx\tlr/o;
-
- print $_,"\n";
- }
-}
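
Both branches transform the same shared $code toward their respective assemblers. A standalone sketch (hypothetical sample lines) of the register renaming each one applies:

    # Hedged sketch of the per-flavour register rewrites above.
    # 64-bit: q0-q7 map to v0-v7, q8-q15 to v16-v23.
    my $line = "\taese\tq8,q12";
    (my $a64 = $line) =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;
    print "$a64\n";               # aese v16.16b,v20.16b

    # 32-bit: new-style GPR and NEON names fall back to old-style.
    $line = "\tvld1.32\t{v0.16b},[x2]";
    (my $a32 = $line) =~ s/\b[wx]([0-9]+)\b/r$1/g;        # new->old GPRs
    $a32 =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;          # new->old NEON
    print "$a32\n";               # vld1.32 {q0},[r2]
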
-
-close STDOUT;
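
A final note on the cclr pseudo-instruction seen throughout the CBC and CTR paths above: it is not a real mnemonic; each branch expands it into a flavour-appropriate conditional clear. A standalone sketch (hypothetical sample inputs):

    # Hedged sketch of the two cclr expansions above.
    my $a64 = "\tcclr\tx8,eq";
    $a64 =~ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel\t$1$2,$1zr,$1$2,$3/;
    print "$a64\n";               # csel x8,xzr,x8,eq  (64-bit flavour)

    my $a32 = "\tcclr\tr8,eq";    # after new->old register renaming
    $a32 =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2\t$1,#0/;
    print "$a32\n";               # moveq r8,#0        (32-bit flavour)
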