summaryrefslogtreecommitdiff
path: root/main/openssl/crypto/aes/asm
diff options
context:
space:
mode:
authorArne Schwabe <arne@rfc2549.org>2014-06-24 11:12:30 +0200
committerArne Schwabe <arne@rfc2549.org>2014-06-24 11:12:30 +0200
commiteb7891f0af9b787202e91e903b612eb2467ca870 (patch)
treedd2ad0eac533dca54f6ae2231430052bf8860a9d /main/openssl/crypto/aes/asm
parent830aec478ff774bf8beeb321f5f67a08a133e95f (diff)
Update OpenSSL to aosp/master
--HG-- extra : amend_source : 1d737bc919a18f072b1038b548a2cbad4cf76e22
Diffstat (limited to 'main/openssl/crypto/aes/asm')
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.pl139
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.s160
-rw-r--r--main/openssl/crypto/aes/asm/aesv8-armx-64.S761
-rw-r--r--main/openssl/crypto/aes/asm/aesv8-armx.S767
-rw-r--r--main/openssl/crypto/aes/asm/aesv8-armx.pl980
5 files changed, 2752 insertions, 55 deletions
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.pl b/main/openssl/crypto/aes/asm/aes-armv4.pl
index 86b86c4a..4f891708 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/main/openssl/crypto/aes/asm/aes-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -51,9 +51,23 @@ $key="r11";
$rounds="r12";
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
.code 32
+# endif
+#endif
.type AES_Te,%object
.align 5
@@ -167,7 +181,11 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
+#else
+ adr r3,AES_encrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -409,11 +427,21 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
+#else
+ adr r3,private_AES_set_encrypt_key
+#endif
teq r0,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
@@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
+#if __ARM_ARCH__>=7
+ itt ne @ Thumb2 thing, sanity check in ARM
+#endif
movne r0,#-1
bne .Labrt
@@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#216
beq .Ldone
@@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#256
beq .Ldone
@@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:
str $i3,[$key,#-4]
b .L256_loop
+.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -688,34 +731,57 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldrne lr,[sp],#4 @ pop lr
+ ldr lr,[sp],#4 @ pop lr
bne .Labrt
- stmdb sp!,{r4-r12}
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
+ mov r1,r2 @ which is AES_KEY *key
+ b _armv4_AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
- ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
- mov $key,r2 @ which is AES_KEY *key
- mov $i1,r2
- add $i2,r2,$rounds,lsl#4
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
+.global AES_set_enc2dec_key
+.type AES_set_enc2dec_key,%function
+.align 5
+AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+ stmdb sp!,{r4-r12,lr}
+
+ ldr $rounds,[r0,#240]
+ mov $i1,r0 @ input
+ add $i2,r0,$rounds,lsl#4
+ mov $key,r1 @ ouput
+ add $tbl,r1,$rounds,lsl#4
+ str $rounds,[r1,#240]
+
+.Linv: ldr $s0,[$i1],#16
+ ldr $s1,[$i1,#-12]
+ ldr $s2,[$i1,#-8]
+ ldr $s3,[$i1,#-4]
+ ldr $t1,[$i2],#-16
+ ldr $t2,[$i2,#16+4]
+ ldr $t3,[$i2,#16+8]
+ ldr $i3,[$i2,#16+12]
+ str $s0,[$tbl],#-16
+ str $s1,[$tbl,#16+4]
+ str $s2,[$tbl,#16+8]
+ str $s3,[$tbl,#16+12]
+ str $t1,[$key],#16
+ str $t2,[$key,#-12]
+ str $t3,[$key,#-8]
+ str $i3,[$key,#-4]
+ teq $i1,$i2
+ bne .Linv
-.Linv: ldr $s0,[$i1]
+ ldr $s0,[$i1]
ldr $s1,[$i1,#4]
ldr $s2,[$i1,#8]
ldr $s3,[$i1,#12]
- ldr $t1,[$i2]
- ldr $t2,[$i2,#4]
- ldr $t3,[$i2,#8]
- ldr $i3,[$i2,#12]
- str $s0,[$i2],#-16
- str $s1,[$i2,#16+4]
- str $s2,[$i2,#16+8]
- str $s3,[$i2,#16+12]
- str $t1,[$i1],#16
- str $t2,[$i1,#-12]
- str $t3,[$i1,#-8]
- str $i3,[$i1,#-4]
- teq $i1,$i2
- bne .Linv
+ str $s0,[$key]
+ str $s1,[$key,#4]
+ str $s2,[$key,#8]
+ str $s3,[$key,#12]
+ sub $key,$key,$rounds,lsl#3
___
$mask80=$i1;
$mask1b=$i2;
@@ -773,7 +839,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
.type AES_Td,%object
.align 5
@@ -883,7 +949,11 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
+#else
+ adr r3,AES_decrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
+ add $s1,$tbl,$s1,lsr#24
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
- ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
+ ldrb $s1,[$s1] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
@@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
+ add $s2,$tbl,$s2,lsr#24
+ ldrb $s2,[$s2] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
@@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
+ add $s3,$tbl,$s3,lsr#24
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
- ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
+ ldrb $s3,[$s3] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
@@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:
___
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.s b/main/openssl/crypto/aes/asm/aes-armv4.s
index 2697d4ce..333a5227 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.s
+++ b/main/openssl/crypto/aes/asm/aes-armv4.s
@@ -1,6 +1,53 @@
-#include "arm_arch.h"
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ AES for ARMv4
+
+@ January 2007.
+@
+@ Code uses single 1K S-box and is >2 times faster than code generated
+@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
+@ allows to merge logical or arithmetic operation with shift or rotate
+@ in one instruction and emit combined result every cycle. The module
+@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit
+@ key [on single-issue Xscale PXA250 core].
+
+@ May 2007.
+@
+@ AES_set_[en|de]crypt_key is added.
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 12% improvement on
+@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~21.5 cycles per byte.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
.code 32
+# endif
+#endif
.type AES_Te,%object
.align 5
@@ -114,7 +161,11 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
+#else
+ adr r3,AES_encrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
mov r11,r2
@@ -356,11 +407,21 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
+#else
+ adr r3,private_AES_set_encrypt_key
+#endif
teq r0,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
@@ -369,6 +430,9 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
+#if __ARM_ARCH__>=7
+ itt ne @ Thumb2 thing, sanity check in ARM
+#endif
movne r0,#-1
bne .Labrt
@@ -523,6 +587,9 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-16]
subs r12,r12,#1
str r3,[r11,#-12]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,r11,#216
beq .Ldone
@@ -592,6 +659,9 @@ _armv4_AES_set_encrypt_key:
str r2,[r11,#-24]
subs r12,r12,#1
str r3,[r11,#-20]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,r11,#256
beq .Ldone
@@ -621,11 +691,17 @@ _armv4_AES_set_encrypt_key:
str r9,[r11,#-4]
b .L256_loop
+.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ bx lr @ .word 0xe12fff1e
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -635,34 +711,57 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldrne lr,[sp],#4 @ pop lr
+ ldr lr,[sp],#4 @ pop lr
bne .Labrt
- stmdb sp!,{r4-r12}
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
+ mov r1,r2 @ which is AES_KEY *key
+ b _armv4_AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
+.global AES_set_enc2dec_key
+.type AES_set_enc2dec_key,%function
+.align 5
+AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+ stmdb sp!,{r4-r12,lr}
+
+ ldr r12,[r0,#240]
+ mov r7,r0 @ input
+ add r8,r0,r12,lsl#4
+ mov r11,r1 @ ouput
+ add r10,r1,r12,lsl#4
+ str r12,[r1,#240]
- ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2,
- mov r11,r2 @ which is AES_KEY *key
- mov r7,r2
- add r8,r2,r12,lsl#4
+.Linv: ldr r0,[r7],#16
+ ldr r1,[r7,#-12]
+ ldr r2,[r7,#-8]
+ ldr r3,[r7,#-4]
+ ldr r4,[r8],#-16
+ ldr r5,[r8,#16+4]
+ ldr r6,[r8,#16+8]
+ ldr r9,[r8,#16+12]
+ str r0,[r10],#-16
+ str r1,[r10,#16+4]
+ str r2,[r10,#16+8]
+ str r3,[r10,#16+12]
+ str r4,[r11],#16
+ str r5,[r11,#-12]
+ str r6,[r11,#-8]
+ str r9,[r11,#-4]
+ teq r7,r8
+ bne .Linv
-.Linv: ldr r0,[r7]
+ ldr r0,[r7]
ldr r1,[r7,#4]
ldr r2,[r7,#8]
ldr r3,[r7,#12]
- ldr r4,[r8]
- ldr r5,[r8,#4]
- ldr r6,[r8,#8]
- ldr r9,[r8,#12]
- str r0,[r8],#-16
- str r1,[r8,#16+4]
- str r2,[r8,#16+8]
- str r3,[r8,#16+12]
- str r4,[r7],#16
- str r5,[r7,#-12]
- str r6,[r7,#-8]
- str r9,[r7,#-4]
- teq r7,r8
- bne .Linv
+ str r0,[r11]
+ str r1,[r11,#4]
+ str r2,[r11,#8]
+ str r3,[r11,#12]
+ sub r11,r11,r12,lsl#3
ldr r0,[r11,#16]! @ prefetch tp1
mov r7,#0x80
mov r8,#0x1b
@@ -715,7 +814,7 @@ private_AES_set_decrypt_key:
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
.type AES_Td,%object
.align 5
@@ -825,7 +924,11 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
+#else
+ adr r3,AES_decrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
mov r11,r2
@@ -1022,8 +1125,9 @@ _armv4_AES_decrypt:
ldrb r6,[r10,r9] @ Td4[s0>>0]
and r9,lr,r1,lsr#8
+ add r1,r10,r1,lsr#24
ldrb r7,[r10,r7] @ Td4[s1>>0]
- ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24]
+ ldrb r1,[r1] @ Td4[s1>>24]
ldrb r8,[r10,r8] @ Td4[s1>>16]
eor r0,r7,r0,lsl#24
ldrb r9,[r10,r9] @ Td4[s1>>8]
@@ -1036,7 +1140,8 @@ _armv4_AES_decrypt:
ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
+ add r2,r10,r2,lsr#24
+ ldrb r2,[r2] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
eor r1,r8,r1,lsl#16
@@ -1048,8 +1153,9 @@ _armv4_AES_decrypt:
ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
+ add r3,r10,r3,lsr#24
ldrb r9,[r10,r9] @ Td4[s3>>0]
- ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
+ ldrb r3,[r3] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
ldr r7,[r11,#0]
eor r1,r1,r8,lsl#8
diff --git a/main/openssl/crypto/aes/asm/aesv8-armx-64.S b/main/openssl/crypto/aes/asm/aesv8-armx-64.S
new file mode 100644
index 00000000..be0a13df
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesv8-armx-64.S
@@ -0,0 +1,761 @@
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.align 5
+rcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl aes_v8_set_encrypt_key
+.type aes_v8_set_encrypt_key,%function
+.align 5
+aes_v8_set_encrypt_key:
+.Lenc_key:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ adr x3,rcon
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne .Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+.Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne .Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+.Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq .Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b .Loop256
+
+.Ldone:
+ str w12,[x2]
+
+ eor x0,x0,x0 // return value
+ ldr x29,[sp],#16
+ ret
+.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
+
+.globl aes_v8_set_decrypt_key
+.type aes_v8_set_decrypt_key,%function
+.align 5
+aes_v8_set_decrypt_key:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl .Lenc_key
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+.Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi .Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+ ldp x29,x30,[sp],#16
+ ret
+.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
+.globl aes_v8_encrypt
+.type aes_v8_encrypt,%function
+.align 5
+aes_v8_encrypt:
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_enc:
+ aese v2.16b,v0.16b
+ ld1 {v0.4s},[x2],#16
+ aesmc v2.16b,v2.16b
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ ld1 {v1.4s},[x2],#16
+ aesmc v2.16b,v2.16b
+ b.gt .Loop_enc
+
+ aese v2.16b,v0.16b
+ ld1 {v0.4s},[x2]
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_v8_encrypt,.-aes_v8_encrypt
+.globl aes_v8_decrypt
+.type aes_v8_decrypt,%function
+.align 5
+aes_v8_decrypt:
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_dec:
+ aesd v2.16b,v0.16b
+ ld1 {v0.4s},[x2],#16
+ aesimc v2.16b,v2.16b
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ ld1 {v1.4s},[x2],#16
+ aesimc v2.16b,v2.16b
+ b.gt .Loop_dec
+
+ aesd v2.16b,v0.16b
+ ld1 {v0.4s},[x2]
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_v8_decrypt,.-aes_v8_decrypt
+.globl aes_v8_cbc_encrypt
+.type aes_v8_cbc_encrypt,%function
+.align 5
+aes_v8_cbc_encrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s-v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s-v19.4s},[x7],#32
+ ld1 {v20.4s-v21.4s},[x7],#32
+ ld1 {v22.4s-v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq .Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq .Lcbc_enc128
+
+.Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ ld1 {v16.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ b.gt .Loop_cbc_enc
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ add x7,x3,#16
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+
+ mov w6,w5
+ eor v6.16b,v0.16b,v7.16b
+ st1 {v6.16b},[x1],#16
+ b.hs .Loop_cbc_enc
+
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ ld1 {v2.4s-v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_dec128:
+ ld1 {v4.4s-v5.4s},[x7]
+ eor v6.16b,v6.16b,v7.16b
+ eor v2.16b,v0.16b,v7.16b
+ mov x12,x8
+
+.Loop2x_cbc_dec128:
+ aesd v0.16b,v16.16b
+ aesd v1.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ subs x2,x2,#32
+ aesd v0.16b,v17.16b
+ aesd v1.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ csel x8,xzr,x8,lo
+ aesd v0.16b,v4.16b
+ aesd v1.16b,v4.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ csel x12,xzr,x12,ls
+ aesd v0.16b,v5.16b
+ aesd v1.16b,v5.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v18.16b
+ aesd v1.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v19.16b
+ aesd v1.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v20.16b
+ aesd v1.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v21.16b
+ aesd v1.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v22.16b
+ aesd v1.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+
+ eor v6.16b,v6.16b,v0.16b
+ ld1 {v0.16b},[x0],x8
+ eor v2.16b,v2.16b,v1.16b
+ ld1 {v1.16b},[x0],x12
+ st1 {v6.16b},[x1],#16
+ eor v6.16b,v3.16b,v7.16b
+ st1 {v2.16b},[x1],#16
+ eor v2.16b,v0.16b,v7.16b
+ orr v3.16b,v1.16b,v1.16b
+ b.hs .Loop2x_cbc_dec128
+
+ adds x2,x2,#32
+ eor v6.16b,v6.16b,v7.16b
+ b.eq .Lcbc_done
+ eor v2.16b,v2.16b,v7.16b
+ b .Lcbc_dec_tail
+
+.align 5
+.Lcbc_dec:
+ subs x2,x2,#16
+ orr v2.16b,v0.16b,v0.16b
+ b.lo .Lcbc_dec_tail
+
+ csel x8,xzr,x8,eq
+ cmp w5,#2
+ ld1 {v1.16b},[x0],x8
+ orr v3.16b,v1.16b,v1.16b
+ b.eq .Lcbc_dec128
+
+.Loop2x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesd v1.16b,v16.16b
+ ld1 {v16.4s},[x7],#16
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesd v1.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ b.gt .Loop2x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesd v1.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ eor v4.16b,v6.16b,v7.16b
+ eor v5.16b,v2.16b,v7.16b
+ aesd v0.16b,v17.16b
+ aesd v1.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ orr v6.16b,v3.16b,v3.16b
+ subs x2,x2,#32
+ aesd v0.16b,v18.16b
+ aesd v1.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ csel x8,xzr,x8,lo
+ aesimc v1.16b,v1.16b
+ mov x7,x3
+ aesd v0.16b,v19.16b
+ aesd v1.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v2.16b},[x0],x8
+ aesimc v1.16b,v1.16b
+ csel x8,xzr,x8,ls
+ aesd v0.16b,v20.16b
+ aesd v1.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ld1 {v3.16b},[x0],x8
+ aesd v0.16b,v21.16b
+ aesd v1.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ aesd v0.16b,v22.16b
+ aesd v1.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+
+ mov w6,w5
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v4.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v5.16b},[x1],#16
+ b.hs .Loop2x_cbc_dec
+
+ adds x2,x2,#32
+ b.eq .Lcbc_done
+
+.Lcbc_dec_tail:
+ aesd v0.16b,v16.16b
+ ld1 {v16.4s},[x7],#16
+ aesimc v0.16b,v0.16b
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ aesimc v0.16b,v0.16b
+ b.gt .Lcbc_dec_tail
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ eor v4.16b,v6.16b,v7.16b
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ orr v6.16b,v2.16b,v2.16b
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v23.16b
+
+ eor v4.16b,v4.16b,v0.16b
+ st1 {v4.16b},[x1],#16
+
+.Lcbc_done:
+ st1 {v6.16b},[x4]
+.Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl aes_v8_ctr32_encrypt_blocks
+.type aes_v8_ctr32_encrypt_blocks,%function
+.align 5
+aes_v8_ctr32_encrypt_blocks:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s-v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s-v19.4s},[x7],#32
+ ld1 {v20.4s-v21.4s},[x7],#32
+ ld1 {v22.4s-v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+
+ subs x2,x2,#2
+ b.lo .Lctr32_tail
+
+#ifndef __ARMEB__
+ rev w8, w8
+#endif
+ orr v1.16b,v0.16b,v0.16b
+ add w8, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w8
+ cmp w5,#2
+ mov v1.s[3],w10
+ b.eq .Lctr32_128
+
+.Loop2x_ctr32:
+ aese v0.16b,v16.16b
+ aese v1.16b,v16.16b
+ ld1 {v16.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aese v1.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ b.gt .Loop2x_ctr32
+
+ aese v0.16b,v16.16b
+ aese v1.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ orr v0.16b,v6.16b,v6.16b
+ aesmc v5.16b,v1.16b
+ orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
+ aese v5.16b,v17.16b
+ ld1 {v2.16b},[x0],#16
+ aesmc v4.16b,v4.16b
+ ld1 {v3.16b},[x0],#16
+ aesmc v5.16b,v5.16b
+ add w8,w8,#1
+ aese v4.16b,v18.16b
+ aese v5.16b,v18.16b
+ rev w9,w8
+ aesmc v4.16b,v4.16b
+ aesmc v5.16b,v5.16b
+ add w8,w8,#1
+ aese v4.16b,v19.16b
+ aese v5.16b,v19.16b
+ eor v2.16b,v2.16b,v7.16b
+ rev w10,w8
+ aesmc v4.16b,v4.16b
+ aesmc v5.16b,v5.16b
+ eor v3.16b,v3.16b,v7.16b
+ mov x7,x3
+ aese v4.16b,v20.16b
+ aese v5.16b,v20.16b
+ subs x2,x2,#2
+ aesmc v4.16b,v4.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1]
+ aese v4.16b,v21.16b
+ aese v5.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aesmc v5.16b,v5.16b
+ aese v4.16b,v22.16b
+ aese v5.16b,v22.16b
+ mov v0.s[3], w9
+ aesmc v4.16b,v4.16b
+ mov v1.s[3], w10
+ aesmc v5.16b,v5.16b
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+
+ mov w6,w5
+ eor v2.16b,v2.16b,v4.16b
+ eor v3.16b,v3.16b,v5.16b
+ st1 {v2.16b},[x1],#16
+ st1 {v3.16b},[x1],#16
+ b.hs .Loop2x_ctr32
+
+ adds x2,x2,#2
+ b.eq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ ld1 {v4.4s-v5.4s},[x7]
+
+.Loop2x_ctr32_128:
+ aese v0.16b,v16.16b
+ aese v1.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v2.16b},[x0],#16
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0],#16
+ aese v0.16b,v17.16b
+ aese v1.16b,v17.16b
+ add w8,w8,#1
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ rev w9,w8
+ aese v0.16b,v4.16b
+ aese v1.16b,v4.16b
+ add w8,w8,#1
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ rev w10,w8
+ aese v0.16b,v5.16b
+ aese v1.16b,v5.16b
+ subs x2,x2,#2
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v18.16b
+ aese v1.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v19.16b
+ aese v1.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v20.16b
+ aese v1.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v21.16b
+ aese v1.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v22.16b
+ aese v1.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v23.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v1.16b,v23.16b
+
+ eor v2.16b,v2.16b,v0.16b
+ orr v0.16b,v6.16b,v6.16b
+ eor v3.16b,v3.16b,v1.16b
+ orr v1.16b,v6.16b,v6.16b
+ st1 {v2.16b},[x1],#16
+ mov v0.s[3], w9
+ st1 {v3.16b},[x1],#16
+ mov v1.s[3], w10
+ b.hs .Loop2x_ctr32_128
+
+ adds x2,x2,#2
+ b.eq .Lctr32_done
+
+.Lctr32_tail:
+ aese v0.16b,v16.16b
+ ld1 {v16.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ aesmc v0.16b,v0.16b
+ b.gt .Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v2.16b},[x0]
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v23.16b
+
+ eor v2.16b,v2.16b,v0.16b
+ st1 {v2.16b},[x1]
+
+.Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+#endif
diff --git a/main/openssl/crypto/aes/asm/aesv8-armx.S b/main/openssl/crypto/aes/asm/aesv8-armx.S
new file mode 100644
index 00000000..1637e4d4
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesv8-armx.S
@@ -0,0 +1,767 @@
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+.fpu neon
+.code 32
+.align 5
+rcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl aes_v8_set_encrypt_key
+.type aes_v8_set_encrypt_key,%function
+.align 5
+aes_v8_set_encrypt_key:
+.Lenc_key:
+ adr r3,rcon
+ cmp r1,#192
+
+ veor q0,q0,q0
+ vld1.8 {q3},[r0]!
+ mov r1,#8 @ reuse r1
+ vld1.32 {q1,q2},[r3]!
+
+ blt .Loop128
+ beq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ bne .Loop128
+
+ vld1.32 {q1},[r3]
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]
+ add r2,r2,#0x50
+
+ mov r12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {d16},[r0]!
+ vmov.i8 q10,#8 @ borrow q10
+ vst1.32 {q3},[r2]!
+ vsub.i8 q2,q2,q10 @ adjust the mask
+
+.Loop192:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {d16},[r2]!
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+
+ vdup.32 q9,d7[1]
+ veor q9,q9,q8
+ veor q10,q10,q1
+ vext.8 q8,q0,q8,#12
+ vshl.u8 q1,q1,#1
+ veor q8,q8,q9
+ veor q3,q3,q10
+ veor q8,q8,q10
+ vst1.32 {q3},[r2]!
+ bne .Loop192
+
+ mov r12,#12
+ add r2,r2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {q8},[r0]
+ mov r1,#7
+ mov r12,#14
+ vst1.32 {q3},[r2]!
+
+.Loop256:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q8},[r2]!
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]!
+ beq .Ldone
+
+ vdup.32 q10,d7[1]
+ vext.8 q9,q0,q8,#12
+ .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+
+ veor q8,q8,q10
+ b .Loop256
+
+.Ldone:
+ str r12,[r2]
+
+ eor r0,r0,r0 @ return value
+
+ bx lr
+.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
+
+.globl aes_v8_set_decrypt_key
+.type aes_v8_set_decrypt_key,%function
+.align 5
+aes_v8_set_decrypt_key:
+ stmdb sp!,{r4,lr}
+ bl .Lenc_key
+
+ sub r2,r2,#240 @ restore original r2
+ mov r4,#-16
+ add r0,r2,r12,lsl#4 @ end of key schedule
+
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+
+.Loop_imc:
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+ cmp r0,r2
+ bhi .Loop_imc
+
+ vld1.32 {q0},[r2]
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vst1.32 {q0},[r0]
+
+ eor r0,r0,r0 @ return value
+ ldmia sp!,{r4,pc}
+.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
+.globl aes_v8_encrypt
+.type aes_v8_encrypt,%function
+.align 5
+aes_v8_encrypt:
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_enc:
+ .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+ vld1.32 {q0},[r2]!
+ .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ subs r3,r3,#2
+ .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+ vld1.32 {q1},[r2]!
+ .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ bgt .Loop_enc
+
+ .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+ vld1.32 {q0},[r2]
+ .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_v8_encrypt,.-aes_v8_encrypt
+.globl aes_v8_decrypt
+.type aes_v8_decrypt,%function
+.align 5
+aes_v8_decrypt:
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_dec:
+ .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+ vld1.32 {q0},[r2]!
+ .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ subs r3,r3,#2
+ .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+ vld1.32 {q1},[r2]!
+ .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ bgt .Loop_dec
+
+ .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+ vld1.32 {q0},[r2]
+ .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_v8_decrypt,.-aes_v8_decrypt
+.globl aes_v8_cbc_encrypt
+.type aes_v8_cbc_encrypt,%function
+.align 5
+aes_v8_cbc_encrypt:
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+ subs r2,r2,#16
+ mov r8,#16
+ blo .Lcbc_abort
+ moveq r8,#0
+
+ cmp r5,#0 @ en- or decrypting?
+ ldr r5,[r3,#240]
+ and r2,r2,#-16
+ vld1.8 {q6},[r4]
+ vld1.8 {q0},[r0],r8
+
+ vld1.32 {q8-q9},[r3] @ load key schedule...
+ sub r5,r5,#6
+ add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
+ sub r5,r5,#2
+ vld1.32 {q10-q11},[r7]!
+ vld1.32 {q12-q13},[r7]!
+ vld1.32 {q14-q15},[r7]!
+ vld1.32 {q7},[r7]
+
+ add r7,r3,#32
+ mov r6,r5
+ beq .Lcbc_dec
+
+ cmp r5,#2
+ veor q0,q0,q6
+ veor q5,q8,q7
+ beq .Lcbc_enc128
+
+.Loop_cbc_enc:
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ vld1.32 {q8},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r6,r6,#2
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ vld1.32 {q9},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ bgt .Loop_cbc_enc
+
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+ .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r7,r3,#16
+ .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+ .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+ .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+
+ mov r6,r5
+ veor q6,q0,q7
+ vst1.8 {q6},[r1]!
+ bhs .Loop_cbc_enc
+
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {q2-q3},[r7]
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc128:
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+ .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+ .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+ .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+ .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs .Loop_cbc_enc128
+
+ vst1.8 {q6},[r1]!
+ b .Lcbc_done
+
+.align 5
+.Lcbc_dec128:
+ vld1.32 {q4-q5},[r7]
+ veor q6,q6,q7
+ veor q2,q0,q7
+ mov r12,r8
+
+.Loop2x_cbc_dec128:
+ .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ subs r2,r2,#32
+ .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ movlo r8,#0
+ .byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4
+ .byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ movls r12,#0
+ .byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5
+ .byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
+ .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
+ .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+ .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+ .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+ .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+ .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+
+ veor q6,q6,q0
+ vld1.8 {q0},[r0],r8
+ veor q2,q2,q1
+ vld1.8 {q1},[r0],r12
+ vst1.8 {q6},[r1]!
+ veor q6,q3,q7
+ vst1.8 {q2},[r1]!
+ veor q2,q0,q7
+ vorr q3,q1,q1
+ bhs .Loop2x_cbc_dec128
+
+ adds r2,r2,#32
+ veor q6,q6,q7
+ beq .Lcbc_done
+ veor q2,q2,q7
+ b .Lcbc_dec_tail
+
+.align 5
+.Lcbc_dec:
+ subs r2,r2,#16
+ vorr q2,q0,q0
+ blo .Lcbc_dec_tail
+
+ moveq r8,#0
+ cmp r5,#2
+ vld1.8 {q1},[r0],r8
+ vorr q3,q1,q1
+ beq .Lcbc_dec128
+
+.Loop2x_cbc_dec:
+ .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+ vld1.32 {q8},[r7]!
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ subs r6,r6,#2
+ .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+ vld1.32 {q9},[r7]!
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ bgt .Loop2x_cbc_dec
+
+ .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ veor q4,q6,q7
+ veor q5,q2,q7
+ .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vorr q6,q3,q3
+ subs r2,r2,#32
+ .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
+ .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ movlo r8,#0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ mov r7,r3
+ .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
+ .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vld1.8 {q2},[r0],r8
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ movls r8,#0
+ .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+ .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vld1.8 {q3},[r0],r8
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+ .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+ .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+ .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+
+ mov r6,r5
+ veor q4,q4,q0
+ veor q5,q5,q1
+ vorr q0,q2,q2
+ vst1.8 {q4},[r1]!
+ vorr q1,q3,q3
+ vst1.8 {q5},[r1]!
+ bhs .Loop2x_cbc_dec
+
+ adds r2,r2,#32
+ beq .Lcbc_done
+
+.Lcbc_dec_tail:
+ .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+ vld1.32 {q8},[r7]!
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ subs r6,r6,#2
+ .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+ vld1.32 {q9},[r7]!
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ bgt .Lcbc_dec_tail
+
+ .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ veor q4,q6,q7
+ .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vorr q6,q2,q2
+ .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+
+ veor q4,q4,q0
+ vst1.8 {q4},[r1]!
+
+.Lcbc_done:
+ vst1.8 {q6},[r4]
+.Lcbc_abort:
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl aes_v8_ctr32_encrypt_blocks
+.type aes_v8_ctr32_encrypt_blocks,%function
+.align 5
+aes_v8_ctr32_encrypt_blocks:
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+ ldr r5,[r3,#240]
+
+ ldr r8, [r4, #12]
+ vld1.32 {q0},[r4]
+
+ vld1.32 {q8-q9},[r3] @ load key schedule...
+ sub r5,r5,#6
+ add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
+ sub r5,r5,#2
+ vld1.32 {q10-q11},[r7]!
+ vld1.32 {q12-q13},[r7]!
+ vld1.32 {q14-q15},[r7]!
+ vld1.32 {q7},[r7]
+
+ add r7,r3,#32
+ mov r6,r5
+
+ subs r2,r2,#2
+ blo .Lctr32_tail
+
+#ifndef __ARMEB__
+ rev r8, r8
+#endif
+ vorr q1,q0,q0
+ add r8, r8, #1
+ vorr q6,q0,q0
+ rev r10, r8
+ cmp r5,#2
+ vmov.32 d3[1],r10
+ beq .Lctr32_128
+
+.Loop2x_ctr32:
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+ vld1.32 {q8},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ subs r6,r6,#2
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+ vld1.32 {q9},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ bgt .Loop2x_ctr32
+
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+ .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
+ vorr q0,q6,q6
+ .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
+ vorr q1,q6,q6
+ .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+ .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
+ vld1.8 {q2},[r0]!
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ vld1.8 {q3},[r0]!
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ add r8,r8,#1
+ .byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10
+ .byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10
+ rev r9,r8
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ add r8,r8,#1
+ .byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11
+ .byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11
+ veor q2,q2,q7
+ rev r10,r8
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ veor q3,q3,q7
+ mov r7,r3
+ .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+ .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
+ subs r2,r2,#2
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1]
+ .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+ .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+ .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
+ vmov.32 d1[1], r9
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ vmov.32 d3[1], r10
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
+ .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
+
+ mov r6,r5
+ veor q2,q2,q4
+ veor q3,q3,q5
+ vst1.8 {q2},[r1]!
+ vst1.8 {q3},[r1]!
+ bhs .Loop2x_ctr32
+
+ adds r2,r2,#2
+ beq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ vld1.32 {q4-q5},[r7]
+
+.Loop2x_ctr32_128:
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q2},[r0]!
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]!
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+ add r8,r8,#1
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ rev r9,r8
+ .byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4
+ .byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4
+ add r8,r8,#1
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ rev r10,r8
+ .byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5
+ .byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5
+ subs r2,r2,#2
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+ .byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+ .byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+ .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+ .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+ .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q2,q2,q7
+ .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q3,q3,q7
+ .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
+
+ veor q2,q2,q0
+ vorr q0,q6,q6
+ veor q3,q3,q1
+ vorr q1,q6,q6
+ vst1.8 {q2},[r1]!
+ vmov.32 d1[1], r9
+ vst1.8 {q3},[r1]!
+ vmov.32 d3[1], r10
+ bhs .Loop2x_ctr32_128
+
+ adds r2,r2,#2
+ beq .Lctr32_done
+
+.Lctr32_tail:
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ vld1.32 {q8},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r6,r6,#2
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ vld1.32 {q9},[r7]!
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ bgt .Lctr32_tail
+
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q2},[r0]
+ .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q2,q2,q7
+ .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+
+ veor q2,q2,q0
+ vst1.8 {q2},[r1]
+
+.Lctr32_done:
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+#endif
diff --git a/main/openssl/crypto/aes/asm/aesv8-armx.pl b/main/openssl/crypto/aes/asm/aesv8-armx.pl
new file mode 100644
index 00000000..415dc04a
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,980 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in sense that it supports both big- and
+# little-endian cases. As does it support both 32- and 64-bit modes
+# of operation. Latter is achieved by limiting amount of utilized
+# registers to 16, which implies additional instructions. This has
+# no effect on mighty Apple A7, as results are literally equal to
+# the theoretical estimates based on instruction latencies and issue
+# rate. It remains to be seen how does it affect other platforms...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A5x n/a n/a n/a
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
+# maintain both 32- and 64-bit codes within single module and
+# transliterate common code to either flavour with regex vodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+ $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align 5
+rcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___;
+ adr $ptr,rcon
+ cmp $bits,#192
+
+ veor $zero,$zero,$zero
+ vld1.8 {$in0},[$inp],#16
+ mov $bits,#8 // reuse $bits
+ vld1.32 {$rcon,$mask},[$ptr],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ b.ne .Loop128
+
+ vld1.32 {$rcon},[$ptr]
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out]
+ add $out,$out,#0x50
+
+ mov $rounds,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {$in1},[$inp],#8
+ vmov.i8 $key,#8 // borrow $key
+ vst1.32 {$in0},[$out],#16
+ vsub.i8 $mask,$mask,$key // adjust the mask
+
+.Loop192:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#8
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+
+ vdup.32 $tmp,${in0}[3]
+ veor $tmp,$tmp,$in1
+ veor $key,$key,$rcon
+ vext.8 $in1,$zero,$in1,#12
+ vshl.u8 $rcon,$rcon,#1
+ veor $in1,$in1,$tmp
+ veor $in0,$in0,$key
+ veor $in1,$in1,$key
+ vst1.32 {$in0},[$out],#16
+ b.ne .Loop192
+
+ mov $rounds,#12
+ add $out,$out,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {$in1},[$inp]
+ mov $bits,#7
+ mov $rounds,#14
+ vst1.32 {$in0},[$out],#16
+
+.Loop256:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out],#16
+ b.eq .Ldone
+
+ vdup.32 $key,${in0}[3] // just splat
+ vext.8 $tmp,$zero,$in1,#12
+ aese $key,$zero
+
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+
+ veor $in1,$in1,$key
+ b .Loop256
+
+.Ldone:
+ str $rounds,[$out]
+
+ eor x0,x0,x0 // return value
+ `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ stmdb sp!,{r4,lr}
+___
+$code.=<<___;
+ bl .Lenc_key
+
+ sub $out,$out,#240 // restore original $out
+ mov x4,#-16
+ add $inp,$out,x12,lsl#4 // end of key schedule
+
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+
+.Loop_imc:
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+ cmp $inp,$out
+ b.hi .Loop_imc
+
+ vld1.32 {v0.16b},[$out]
+ aesimc v0.16b,v0.16b
+ vst1.32 {v0.16b},[$inp]
+
+ eor x0,x0,x0 // return value
+___
+$code.=<<___ if ($flavour !~ /64/);
+ ldmia sp!,{r4,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+ sub $rounds,$rounds,#2
+ vld1.32 {$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+ aes$e $inout,$rndkey0
+ vld1.32 {$rndkey0},[$key],#16
+ aes$mc $inout,$inout
+ subs $rounds,$rounds,#2
+ aes$e $inout,$rndkey1
+ vld1.32 {$rndkey1},[$key],#16
+ aes$mc $inout,$inout
+ b.gt .Loop_${dir}c
+
+ aes$e $inout,$rndkey0
+ vld1.32 {$rndkey0},[$key]
+ aes$mc $inout,$inout
+ aes$e $inout,$rndkey1
+ veor $inout,$inout,$rndkey0
+
+ vst1.8 {$inout},[$out]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+___
+$code.=<<___;
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lcbc_abort
+ cclr $step,eq
+
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ and $len,$len,#-16
+ vld1.8 {$ivec},[$ivp]
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ b.eq .Lcbc_dec
+
+ cmp $rounds,#2
+ veor $dat,$dat,$ivec
+ veor $rndzero_n_last,q8,$rndlast
+ b.eq .Lcbc_enc128
+
+.Loop_cbc_enc:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Loop_cbc_enc
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,q9
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,q10
+ aesmc $dat,$dat
+ add $key_,$key,#16
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q13
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aese $dat,q14
+ aesmc $dat,$dat
+ aese $dat,q15
+
+ mov $cnt,$rounds
+ veor $ivec,$dat,$rndlast
+ vst1.8 {$ivec},[$out],#16
+ b.hs .Loop_cbc_enc
+
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {$in0-$in1},[$key_]
+ aese $dat,q8
+ aesmc $dat,$dat
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc128:
+ aese $dat,q9
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,$in0
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,$in1
+ aesmc $dat,$dat
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc128
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_dec128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+ veor $ivec,$ivec,$rndlast
+ veor $in0,$dat0,$rndlast
+ mov $step1,$step
+
+.Loop2x_cbc_dec128:
+ aesd $dat0,q8
+ aesd $dat1,q8
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ subs $len,$len,#32
+ aesd $dat0,q9
+ aesd $dat1,q9
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ cclr $step,lo
+ aesd $dat0,$tmp0
+ aesd $dat1,$tmp0
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ cclr $step1,ls
+ aesd $dat0,$tmp1
+ aesd $dat1,$tmp1
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q10
+ aesd $dat1,q10
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q11
+ aesd $dat1,q11
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q12
+ aesd $dat1,q12
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q13
+ aesd $dat1,q13
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q14
+ aesd $dat1,q14
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q15
+ aesd $dat1,q15
+
+ veor $ivec,$ivec,$dat0
+ vld1.8 {$dat0},[$inp],$step
+ veor $in0,$in0,$dat1
+ vld1.8 {$dat1},[$inp],$step1
+ vst1.8 {$ivec},[$out],#16
+ veor $ivec,$in1,$rndlast
+ vst1.8 {$in0},[$out],#16
+ veor $in0,$dat0,$rndlast
+ vorr $in1,$dat1,$dat1
+ b.hs .Loop2x_cbc_dec128
+
+ adds $len,$len,#32
+ veor $ivec,$ivec,$rndlast
+ b.eq .Lcbc_done
+ veor $in0,$in0,$rndlast
+ b .Lcbc_dec_tail
+
+.align 5
+.Lcbc_dec:
+ subs $len,$len,#16
+ vorr $in0,$dat,$dat
+ b.lo .Lcbc_dec_tail
+
+ cclr $step,eq
+ cmp $rounds,#2
+ vld1.8 {$dat1},[$inp],$step
+ vorr $in1,$dat1,$dat1
+ b.eq .Lcbc_dec128
+
+.Loop2x_cbc_dec:
+ aesd $dat0,q8
+ aesd $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesd $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ b.gt .Loop2x_cbc_dec
+
+ aesd $dat0,q8
+ aesd $dat1,q8
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ veor $tmp0,$ivec,$rndlast
+ veor $tmp1,$in0,$rndlast
+ aesd $dat0,q9
+ aesd $dat1,q9
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vorr $ivec,$in1,$in1
+ subs $len,$len,#32
+ aesd $dat0,q10
+ aesd $dat1,q10
+ aesimc $dat0,$dat0
+ cclr $step,lo
+ aesimc $dat1,$dat1
+ mov $key_,$key
+ aesd $dat0,q11
+ aesd $dat1,q11
+ aesimc $dat0,$dat0
+ vld1.8 {$in0},[$inp],$step
+ aesimc $dat1,$dat1
+ cclr $step,ls
+ aesd $dat0,q12
+ aesd $dat1,q12
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.8 {$in1},[$inp],$step
+ aesd $dat0,q13
+ aesd $dat1,q13
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesd $dat0,q14
+ aesd $dat1,q14
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aesd $dat0,q15
+ aesd $dat1,q15
+
+ mov $cnt,$rounds
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$tmp1},[$out],#16
+ b.hs .Loop2x_cbc_dec
+
+ adds $len,$len,#32
+ b.eq .Lcbc_done
+
+.Lcbc_dec_tail:
+ aesd $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesimc $dat,$dat
+ subs $cnt,$cnt,#2
+ aesd $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesimc $dat,$dat
+ b.gt .Lcbc_dec_tail
+
+ aesd $dat,q8
+ aesimc $dat,$dat
+ aesd $dat,q9
+ aesimc $dat,$dat
+ veor $tmp,$ivec,$rndlast
+ aesd $dat,q10
+ aesimc $dat,$dat
+ vorr $ivec,$in0,$in0
+ aesd $dat,q11
+ aesimc $dat,$dat
+ aesd $dat,q12
+ aesimc $dat,$dat
+ aesd $dat,q13
+ aesimc $dat,$dat
+ aesd $dat,q14
+ aesimc $dat,$dat
+ aesd $dat,q15
+
+ veor $tmp,$tmp,$dat
+ vst1.8 {$tmp},[$out],#16
+
+.Lcbc_done:
+ vst1.8 {$ivec},[$ivp]
+.Lcbc_abort:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+
+ subs $len,$len,#2
+ b.lo .Lctr32_tail
+
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $ctr, $ctr, #1
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $ctr
+ cmp $rounds,#2
+ vmov.32 ${dat1}[3],$tctr1
+ b.eq .Lctr32_128
+
+.Loop2x_ctr32:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Loop2x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $tmp0,$dat0
+ vorr $dat0,$ivec,$ivec
+ aesmc $tmp1,$dat1
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aese $tmp1,q9
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in1},[$inp],#16
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q10
+ aese $tmp1,q10
+ rev $tctr,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q11
+ aese $tmp1,q11
+ veor $in0,$in0,$rndlast
+ rev $tctr1,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ veor $in1,$in1,$rndlast
+ mov $key_,$key
+ aese $tmp0,q12
+ aese $tmp1,q12
+ subs $len,$len,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q14
+ aese $tmp1,q14
+ vmov.32 ${dat0}[3], $tctr
+ aesmc $tmp0,$tmp0
+ vmov.32 ${dat1}[3], $tctr1
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q15
+ aese $tmp1,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ b.hs .Loop2x_ctr32
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+
+.Loop2x_ctr32_128:
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat0,q9
+ aese $dat1,q9
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr,$ctr
+ aese $dat0,$tmp0
+ aese $dat1,$tmp0
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr1,$ctr
+ aese $dat0,$tmp1
+ aese $dat1,$tmp1
+ subs $len,$len,#2
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q10
+ aese $dat1,q10
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q11
+ aese $dat1,q11
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q12
+ aese $dat1,q12
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q15
+ veor $in1,$in1,$rndlast
+ aese $dat1,q15
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3], $tctr
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3], $tctr1
+ b.hs .Loop2x_ctr32_128
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+
+.Lctr32_tail:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Lctr32_tail
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.8 {$in0},[$inp]
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor $in0,$in0,$rndlast
+ aese $dat,q15
+
+ veor $in0,$in0,$dat
+ vst1.8 {$in0},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) { ######## 64-bit code
+ my %opcode = (
+ "aesd" => 0x4e285800, "aese" => 0x4e284800,
+ "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5),
+ $mnemonic,$arg;
+ };
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ # fix up remainig legacy suffixes
+ s/\.[ui]?8//o;
+ m/\],#8/o and s/\.16b/\.8b/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ my %opcode = (
+ "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
+ "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ };
+
+ sub unvtbl {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ }
+
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ }
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remainig new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
+ s/\],#[0-9]+/]!/o;
+
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT;