diff options
Diffstat (limited to 'app/openssl/crypto/aes/asm')
-rw-r--r-- | app/openssl/crypto/aes/asm/aes-armv4.pl | 139 | ||||
-rw-r--r-- | app/openssl/crypto/aes/asm/aes-armv4.s | 160 | ||||
-rw-r--r-- | app/openssl/crypto/aes/asm/aesv8-armx-64.S | 761 | ||||
-rw-r--r-- | app/openssl/crypto/aes/asm/aesv8-armx.S | 767 | ||||
-rw-r--r-- | app/openssl/crypto/aes/asm/aesv8-armx.pl | 980 |
5 files changed, 55 insertions, 2752 deletions
diff --git a/app/openssl/crypto/aes/asm/aes-armv4.pl b/app/openssl/crypto/aes/asm/aes-armv4.pl index 4f891708..86b86c4a 100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.pl +++ b/app/openssl/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,23 +51,9 @@ $key="r11"; $rounds="r12"; $code=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -#endif - +#include "arm_arch.h" .text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else .code 32 -# endif -#endif .type AES_Te,%object .align 5 @@ -181,11 +167,7 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt -#else - adr r3,AES_encrypt -#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -427,21 +409,11 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key -#else - adr r3,private_AES_set_encrypt_key -#endif teq r0,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif moveq r0,#-1 beq .Labrt @@ -450,9 +422,6 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 - itt ne @ Thumb2 thing, sanity check in ARM -#endif movne r0,#-1 bne .Labrt @@ -607,9 +576,6 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM 
-#endif subeq r2,$key,#216 beq .Ldone @@ -679,9 +645,6 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif subeq r2,$key,#256 beq .Ldone @@ -711,17 +674,11 @@ _armv4_AES_set_encrypt_key: str $i3,[$key,#-4] b .L256_loop -.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 +.Labrt: tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -731,57 +688,34 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldr lr,[sp],#4 @ pop lr + ldrne lr,[sp],#4 @ pop lr bne .Labrt - mov r0,r2 @ AES_set_encrypt_key preserves r2, - mov r1,r2 @ which is AES_KEY *key - b _armv4_AES_set_enc2dec_key -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + stmdb sp!,{r4-r12} -@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) -.global AES_set_enc2dec_key -.type AES_set_enc2dec_key,%function -.align 5 -AES_set_enc2dec_key: -_armv4_AES_set_enc2dec_key: - stmdb sp!,{r4-r12,lr} - - ldr $rounds,[r0,#240] - mov $i1,r0 @ input - add $i2,r0,$rounds,lsl#4 - mov $key,r1 @ ouput - add $tbl,r1,$rounds,lsl#4 - str $rounds,[r1,#240] - -.Linv: ldr $s0,[$i1],#16 - ldr $s1,[$i1,#-12] - ldr $s2,[$i1,#-8] - ldr $s3,[$i1,#-4] - ldr $t1,[$i2],#-16 - ldr $t2,[$i2,#16+4] - ldr $t3,[$i2,#16+8] - ldr $i3,[$i2,#16+12] - str $s0,[$tbl],#-16 - str $s1,[$tbl,#16+4] - str $s2,[$tbl,#16+8] - str $s3,[$tbl,#16+12] - str $t1,[$key],#16 - str $t2,[$key,#-12] - str $t3,[$key,#-8] - str $i3,[$key,#-4] - teq $i1,$i2 - bne .Linv + ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, + mov $key,r2 @ which is AES_KEY *key + mov $i1,r2 + add $i2,r2,$rounds,lsl#4 - ldr $s0,[$i1] +.Linv: ldr $s0,[$i1] ldr $s1,[$i1,#4] ldr $s2,[$i1,#8] 
ldr $s3,[$i1,#12] - str $s0,[$key] - str $s1,[$key,#4] - str $s2,[$key,#8] - str $s3,[$key,#12] - sub $key,$key,$rounds,lsl#3 + ldr $t1,[$i2] + ldr $t2,[$i2,#4] + ldr $t3,[$i2,#8] + ldr $i3,[$i2,#12] + str $s0,[$i2],#-16 + str $s1,[$i2,#16+4] + str $s2,[$i2,#16+8] + str $s3,[$i2,#16+12] + str $t1,[$i1],#16 + str $t2,[$i1,#-12] + str $t3,[$i1,#-8] + str $i3,[$i1,#-4] + teq $i1,$i2 + bne .Linv ___ $mask80=$i1; $mask1b=$i2; @@ -839,7 +773,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size AES_set_enc2dec_key,.-AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key .type AES_Td,%object .align 5 @@ -949,11 +883,7 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt -#else - adr r3,AES_decrypt -#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -1150,9 +1080,8 @@ _armv4_AES_decrypt: ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] and $i3,lr,$s1,lsr#8 - add $s1,$tbl,$s1,lsr#24 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] - ldrb $s1,[$s1] @ Td4[s1>>24] + ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] eor $s0,$i1,$s0,lsl#24 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] @@ -1165,8 +1094,7 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - add $s2,$tbl,$s2,lsr#24 - ldrb $s2,[$s2] @ Td4[s2>>24] + ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] eor $s1,$i2,$s1,lsl#16 @@ -1178,9 +1106,8 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 - add $s3,$tbl,$s3,lsr#24 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] - ldrb $s3,[$s3] @ Td4[s3>>24] + ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 ldr $i1,[$key,#0] eor $s1,$s1,$i2,lsl#8 @@ -1203,15 +1130,5 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx\tlr/gm; - -open SELF,$0; 
-while(<SELF>) { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - print $code; close STDOUT; # enforce flush diff --git a/app/openssl/crypto/aes/asm/aes-armv4.s b/app/openssl/crypto/aes/asm/aes-armv4.s index 333a5227..2697d4ce 100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.s +++ b/app/openssl/crypto/aes/asm/aes-armv4.s @@ -1,53 +1,6 @@ - -@ ==================================================================== -@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ ==================================================================== - -@ AES for ARMv4 - -@ January 2007. -@ -@ Code uses single 1K S-box and is >2 times faster than code generated -@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which -@ allows to merge logical or arithmetic operation with shift or rotate -@ in one instruction and emit combined result every cycle. The module -@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit -@ key [on single-issue Xscale PXA250 core]. - -@ May 2007. -@ -@ AES_set_[en|de]crypt_key is added. - -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 12% improvement on -@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 16% -@ improvement on Cortex A8 core and ~21.5 cycles per byte. 
- -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -#endif - +#include "arm_arch.h" .text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else .code 32 -# endif -#endif .type AES_Te,%object .align 5 @@ -161,11 +114,7 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt -#else - adr r3,AES_encrypt -#endif stmdb sp!,{r1,r4-r12,lr} mov r12,r0 @ inp mov r11,r2 @@ -407,21 +356,11 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key -#else - adr r3,private_AES_set_encrypt_key -#endif teq r0,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif moveq r0,#-1 beq .Labrt @@ -430,9 +369,6 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 - itt ne @ Thumb2 thing, sanity check in ARM -#endif movne r0,#-1 bne .Labrt @@ -587,9 +523,6 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-16] subs r12,r12,#1 str r3,[r11,#-12] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif subeq r2,r11,#216 beq .Ldone @@ -659,9 +592,6 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-24] subs r12,r12,#1 str r3,[r11,#-20] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif subeq r2,r11,#256 beq .Ldone @@ -691,17 +621,11 @@ _armv4_AES_set_encrypt_key: str r9,[r11,#-4] b .L256_loop -.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: -#if __ARM_ARCH__>=5 - bx lr @ .word 0xe12fff1e -#else - tst lr,#1 +.Labrt: tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -711,57 +635,34 @@ private_AES_set_decrypt_key: str 
lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldr lr,[sp],#4 @ pop lr + ldrne lr,[sp],#4 @ pop lr bne .Labrt - mov r0,r2 @ AES_set_encrypt_key preserves r2, - mov r1,r2 @ which is AES_KEY *key - b _armv4_AES_set_enc2dec_key -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - -@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) -.global AES_set_enc2dec_key -.type AES_set_enc2dec_key,%function -.align 5 -AES_set_enc2dec_key: -_armv4_AES_set_enc2dec_key: - stmdb sp!,{r4-r12,lr} - - ldr r12,[r0,#240] - mov r7,r0 @ input - add r8,r0,r12,lsl#4 - mov r11,r1 @ ouput - add r10,r1,r12,lsl#4 - str r12,[r1,#240] + stmdb sp!,{r4-r12} -.Linv: ldr r0,[r7],#16 - ldr r1,[r7,#-12] - ldr r2,[r7,#-8] - ldr r3,[r7,#-4] - ldr r4,[r8],#-16 - ldr r5,[r8,#16+4] - ldr r6,[r8,#16+8] - ldr r9,[r8,#16+12] - str r0,[r10],#-16 - str r1,[r10,#16+4] - str r2,[r10,#16+8] - str r3,[r10,#16+12] - str r4,[r11],#16 - str r5,[r11,#-12] - str r6,[r11,#-8] - str r9,[r11,#-4] - teq r7,r8 - bne .Linv + ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2, + mov r11,r2 @ which is AES_KEY *key + mov r7,r2 + add r8,r2,r12,lsl#4 - ldr r0,[r7] +.Linv: ldr r0,[r7] ldr r1,[r7,#4] ldr r2,[r7,#8] ldr r3,[r7,#12] - str r0,[r11] - str r1,[r11,#4] - str r2,[r11,#8] - str r3,[r11,#12] - sub r11,r11,r12,lsl#3 + ldr r4,[r8] + ldr r5,[r8,#4] + ldr r6,[r8,#8] + ldr r9,[r8,#12] + str r0,[r8],#-16 + str r1,[r8,#16+4] + str r2,[r8,#16+8] + str r3,[r8,#16+12] + str r4,[r7],#16 + str r5,[r7,#-12] + str r6,[r7,#-8] + str r9,[r7,#-4] + teq r7,r8 + bne .Linv ldr r0,[r11,#16]! 
@ prefetch tp1 mov r7,#0x80 mov r8,#0x1b @@ -814,7 +715,7 @@ _armv4_AES_set_enc2dec_key: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size AES_set_enc2dec_key,.-AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key .type AES_Td,%object .align 5 @@ -924,11 +825,7 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: -#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt -#else - adr r3,AES_decrypt -#endif stmdb sp!,{r1,r4-r12,lr} mov r12,r0 @ inp mov r11,r2 @@ -1125,9 +1022,8 @@ _armv4_AES_decrypt: ldrb r6,[r10,r9] @ Td4[s0>>0] and r9,lr,r1,lsr#8 - add r1,r10,r1,lsr#24 ldrb r7,[r10,r7] @ Td4[s1>>0] - ldrb r1,[r1] @ Td4[s1>>24] + ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24] ldrb r8,[r10,r8] @ Td4[s1>>16] eor r0,r7,r0,lsl#24 ldrb r9,[r10,r9] @ Td4[s1>>8] @@ -1140,8 +1036,7 @@ _armv4_AES_decrypt: ldrb r8,[r10,r8] @ Td4[s2>>0] and r9,lr,r2,lsr#16 - add r2,r10,r2,lsr#24 - ldrb r2,[r2] @ Td4[s2>>24] + ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24] eor r0,r0,r7,lsl#8 ldrb r9,[r10,r9] @ Td4[s2>>16] eor r1,r8,r1,lsl#16 @@ -1153,9 +1048,8 @@ _armv4_AES_decrypt: ldrb r8,[r10,r8] @ Td4[s3>>8] and r9,lr,r3 @ i2 - add r3,r10,r3,lsr#24 ldrb r9,[r10,r9] @ Td4[s3>>0] - ldrb r3,[r3] @ Td4[s3>>24] + ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24] eor r0,r0,r7,lsl#16 ldr r7,[r11,#0] eor r1,r1,r8,lsl#8 diff --git a/app/openssl/crypto/aes/asm/aesv8-armx-64.S b/app/openssl/crypto/aes/asm/aesv8-armx-64.S deleted file mode 100644 index be0a13df..00000000 --- a/app/openssl/crypto/aes/asm/aesv8-armx-64.S +++ /dev/null @@ -1,761 +0,0 @@ -#include "arm_arch.h" - -#if __ARM_ARCH__>=7 -.text -.arch armv8-a+crypto -.align 5 -rcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.globl aes_v8_set_encrypt_key -.type aes_v8_set_encrypt_key,%function -.align 5 -aes_v8_set_encrypt_key: -.Lenc_key: - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - adr x3,rcon - cmp w1,#192 - - eor v0.16b,v0.16b,v0.16b - ld1 {v3.16b},[x0],#16 - mov w1,#8 // reuse w1 - ld1 {v1.4s,v2.4s},[x3],#32 - - b.lt .Loop128 - b.eq .L192 - b .L256 - -.align 4 -.Loop128: - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - b.ne .Loop128 - - ld1 {v1.4s},[x3] - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2] - add x2,x2,#0x50 - - mov w12,#10 - b .Ldone - -.align 4 -.L192: - ld1 {v4.8b},[x0],#8 - movi v6.16b,#8 // borrow v6.16b - st1 {v3.4s},[x2],#16 - sub v2.16b,v2.16b,v6.16b // adjust the mask - -.Loop192: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.8b},[x2],#8 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - - dup v5.4s,v3.s[3] - eor v5.16b,v5.16b,v4.16b - eor v6.16b,v6.16b,v1.16b - ext v4.16b,v0.16b,v4.16b,#12 - shl v1.16b,v1.16b,#1 - eor v4.16b,v4.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - eor v4.16b,v4.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.ne .Loop192 - - mov w12,#12 - add x2,x2,#0x20 - b 
.Ldone - -.align 4 -.L256: - ld1 {v4.16b},[x0] - mov w1,#7 - mov w12,#14 - st1 {v3.4s},[x2],#16 - -.Loop256: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.eq .Ldone - - dup v6.4s,v3.s[3] // just splat - ext v5.16b,v0.16b,v4.16b,#12 - aese v6.16b,v0.16b - - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - - eor v4.16b,v4.16b,v6.16b - b .Loop256 - -.Ldone: - str w12,[x2] - - eor x0,x0,x0 // return value - ldr x29,[sp],#16 - ret -.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key - -.globl aes_v8_set_decrypt_key -.type aes_v8_set_decrypt_key,%function -.align 5 -aes_v8_set_decrypt_key: - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - bl .Lenc_key - - sub x2,x2,#240 // restore original x2 - mov x4,#-16 - add x0,x2,x12,lsl#4 // end of key schedule - - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - -.Loop_imc: - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - cmp x0,x2 - b.hi .Loop_imc - - ld1 {v0.4s},[x2] - aesimc v0.16b,v0.16b - st1 {v0.4s},[x0] - - eor x0,x0,x0 // return value - ldp x29,x30,[sp],#16 - ret -.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key -.globl aes_v8_encrypt -.type aes_v8_encrypt,%function -.align 5 -aes_v8_encrypt: - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -.Loop_enc: - aese v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 - aesmc v2.16b,v2.16b - subs w3,w3,#2 - aese v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 - aesmc v2.16b,v2.16b - b.gt .Loop_enc - - aese v2.16b,v0.16b - ld1 {v0.4s},[x2] - aesmc v2.16b,v2.16b - aese v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret -.size aes_v8_encrypt,.-aes_v8_encrypt -.globl aes_v8_decrypt -.type aes_v8_decrypt,%function -.align 5 -aes_v8_decrypt: - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -.Loop_dec: - aesd v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 - aesimc v2.16b,v2.16b - subs w3,w3,#2 - aesd v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 - aesimc v2.16b,v2.16b - b.gt .Loop_dec - - aesd v2.16b,v0.16b - ld1 {v0.4s},[x2] - aesimc v2.16b,v2.16b - aesd v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret -.size aes_v8_decrypt,.-aes_v8_decrypt -.globl aes_v8_cbc_encrypt -.type aes_v8_cbc_encrypt,%function -.align 5 -aes_v8_cbc_encrypt: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - subs x2,x2,#16 - mov x8,#16 - b.lo .Lcbc_abort - csel x8,xzr,x8,eq - - cmp w5,#0 // en- or decrypting? 
- ldr w5,[x3,#240] - and x2,x2,#-16 - ld1 {v6.16b},[x4] - ld1 {v0.16b},[x0],x8 - - ld1 {v16.4s-v17.4s},[x3] // load key schedule... - sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys - sub w5,w5,#2 - ld1 {v18.4s-v19.4s},[x7],#32 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - - add x7,x3,#32 - mov w6,w5 - b.eq .Lcbc_dec - - cmp w5,#2 - eor v0.16b,v0.16b,v6.16b - eor v5.16b,v16.16b,v7.16b - b.eq .Lcbc_enc128 - -.Loop_cbc_enc: - aese v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesmc v0.16b,v0.16b - subs w6,w6,#2 - aese v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesmc v0.16b,v0.16b - b.gt .Loop_cbc_enc - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - add x7,x3,#16 - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v0.16b,v23.16b - - mov w6,w5 - eor v6.16b,v0.16b,v7.16b - st1 {v6.16b},[x1],#16 - b.hs .Loop_cbc_enc - - b .Lcbc_done - -.align 5 -.Lcbc_enc128: - ld1 {v2.4s-v3.4s},[x7] - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - b .Lenter_cbc_enc128 -.Loop_cbc_enc128: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 -.Lenter_cbc_enc128: - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v2.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v3.16b - aesmc v0.16b,v0.16b - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v23.16b - eor v6.16b,v0.16b,v7.16b - b.hs .Loop_cbc_enc128 - - st1 
{v6.16b},[x1],#16 - b .Lcbc_done - -.align 5 -.Lcbc_dec128: - ld1 {v4.4s-v5.4s},[x7] - eor v6.16b,v6.16b,v7.16b - eor v2.16b,v0.16b,v7.16b - mov x12,x8 - -.Loop2x_cbc_dec128: - aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - subs x2,x2,#32 - aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - csel x8,xzr,x8,lo - aesd v0.16b,v4.16b - aesd v1.16b,v4.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - csel x12,xzr,x12,ls - aesd v0.16b,v5.16b - aesd v1.16b,v5.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v18.16b - aesd v1.16b,v18.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v19.16b - aesd v1.16b,v19.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v20.16b - aesd v1.16b,v20.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v21.16b - aesd v1.16b,v21.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v22.16b - aesd v1.16b,v22.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - aesd v0.16b,v23.16b - aesd v1.16b,v23.16b - - eor v6.16b,v6.16b,v0.16b - ld1 {v0.16b},[x0],x8 - eor v2.16b,v2.16b,v1.16b - ld1 {v1.16b},[x0],x12 - st1 {v6.16b},[x1],#16 - eor v6.16b,v3.16b,v7.16b - st1 {v2.16b},[x1],#16 - eor v2.16b,v0.16b,v7.16b - orr v3.16b,v1.16b,v1.16b - b.hs .Loop2x_cbc_dec128 - - adds x2,x2,#32 - eor v6.16b,v6.16b,v7.16b - b.eq .Lcbc_done - eor v2.16b,v2.16b,v7.16b - b .Lcbc_dec_tail - -.align 5 -.Lcbc_dec: - subs x2,x2,#16 - orr v2.16b,v0.16b,v0.16b - b.lo .Lcbc_dec_tail - - csel x8,xzr,x8,eq - cmp w5,#2 - ld1 {v1.16b},[x0],x8 - orr v3.16b,v1.16b,v1.16b - b.eq .Lcbc_dec128 - -.Loop2x_cbc_dec: - aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - subs w6,w6,#2 - aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - b.gt .Loop2x_cbc_dec - - aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesimc 
v0.16b,v0.16b - aesimc v1.16b,v1.16b - eor v4.16b,v6.16b,v7.16b - eor v5.16b,v2.16b,v7.16b - aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - orr v6.16b,v3.16b,v3.16b - subs x2,x2,#32 - aesd v0.16b,v18.16b - aesd v1.16b,v18.16b - aesimc v0.16b,v0.16b - csel x8,xzr,x8,lo - aesimc v1.16b,v1.16b - mov x7,x3 - aesd v0.16b,v19.16b - aesd v1.16b,v19.16b - aesimc v0.16b,v0.16b - ld1 {v2.16b},[x0],x8 - aesimc v1.16b,v1.16b - csel x8,xzr,x8,ls - aesd v0.16b,v20.16b - aesd v1.16b,v20.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - ld1 {v3.16b},[x0],x8 - aesd v0.16b,v21.16b - aesd v1.16b,v21.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - aesd v0.16b,v22.16b - aesd v1.16b,v22.16b - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - aesd v0.16b,v23.16b - aesd v1.16b,v23.16b - - mov w6,w5 - eor v4.16b,v4.16b,v0.16b - eor v5.16b,v5.16b,v1.16b - orr v0.16b,v2.16b,v2.16b - st1 {v4.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b - st1 {v5.16b},[x1],#16 - b.hs .Loop2x_cbc_dec - - adds x2,x2,#32 - b.eq .Lcbc_done - -.Lcbc_dec_tail: - aesd v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesimc v0.16b,v0.16b - subs w6,w6,#2 - aesd v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesimc v0.16b,v0.16b - b.gt .Lcbc_dec_tail - - aesd v0.16b,v16.16b - aesimc v0.16b,v0.16b - aesd v0.16b,v17.16b - aesimc v0.16b,v0.16b - eor v4.16b,v6.16b,v7.16b - aesd v0.16b,v18.16b - aesimc v0.16b,v0.16b - orr v6.16b,v2.16b,v2.16b - aesd v0.16b,v19.16b - aesimc v0.16b,v0.16b - aesd v0.16b,v20.16b - aesimc v0.16b,v0.16b - aesd v0.16b,v21.16b - aesimc v0.16b,v0.16b - aesd v0.16b,v22.16b - aesimc v0.16b,v0.16b - aesd v0.16b,v23.16b - - eor v4.16b,v4.16b,v0.16b - st1 {v4.16b},[x1],#16 - -.Lcbc_done: - st1 {v6.16b},[x4] -.Lcbc_abort: - ldr x29,[sp],#16 - ret -.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt -.globl aes_v8_ctr32_encrypt_blocks -.type aes_v8_ctr32_encrypt_blocks,%function -.align 5 
-aes_v8_ctr32_encrypt_blocks: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] - - ldr w8, [x4, #12] - ld1 {v0.4s},[x4] - - ld1 {v16.4s-v17.4s},[x3] // load key schedule... - sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys - sub w5,w5,#2 - ld1 {v18.4s-v19.4s},[x7],#32 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - - add x7,x3,#32 - mov w6,w5 - - subs x2,x2,#2 - b.lo .Lctr32_tail - -#ifndef __ARMEB__ - rev w8, w8 -#endif - orr v1.16b,v0.16b,v0.16b - add w8, w8, #1 - orr v6.16b,v0.16b,v0.16b - rev w10, w8 - cmp w5,#2 - mov v1.s[3],w10 - b.eq .Lctr32_128 - -.Loop2x_ctr32: - aese v0.16b,v16.16b - aese v1.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - subs w6,w6,#2 - aese v0.16b,v17.16b - aese v1.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - b.gt .Loop2x_ctr32 - - aese v0.16b,v16.16b - aese v1.16b,v16.16b - aesmc v4.16b,v0.16b - orr v0.16b,v6.16b,v6.16b - aesmc v5.16b,v1.16b - orr v1.16b,v6.16b,v6.16b - aese v4.16b,v17.16b - aese v5.16b,v17.16b - ld1 {v2.16b},[x0],#16 - aesmc v4.16b,v4.16b - ld1 {v3.16b},[x0],#16 - aesmc v5.16b,v5.16b - add w8,w8,#1 - aese v4.16b,v18.16b - aese v5.16b,v18.16b - rev w9,w8 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - add w8,w8,#1 - aese v4.16b,v19.16b - aese v5.16b,v19.16b - eor v2.16b,v2.16b,v7.16b - rev w10,w8 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - eor v3.16b,v3.16b,v7.16b - mov x7,x3 - aese v4.16b,v20.16b - aese v5.16b,v20.16b - subs x2,x2,#2 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1] - aese v4.16b,v21.16b - aese v5.16b,v21.16b - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - aese v4.16b,v22.16b - aese v5.16b,v22.16b - mov v0.s[3], w9 - aesmc v4.16b,v4.16b - mov v1.s[3], w10 - aesmc v5.16b,v5.16b - aese v4.16b,v23.16b - aese v5.16b,v23.16b - - mov w6,w5 - eor v2.16b,v2.16b,v4.16b - eor v3.16b,v3.16b,v5.16b - st1 {v2.16b},[x1],#16 - st1 
{v3.16b},[x1],#16 - b.hs .Loop2x_ctr32 - - adds x2,x2,#2 - b.eq .Lctr32_done - b .Lctr32_tail - -.Lctr32_128: - ld1 {v4.4s-v5.4s},[x7] - -.Loop2x_ctr32_128: - aese v0.16b,v16.16b - aese v1.16b,v16.16b - aesmc v0.16b,v0.16b - ld1 {v2.16b},[x0],#16 - aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0],#16 - aese v0.16b,v17.16b - aese v1.16b,v17.16b - add w8,w8,#1 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - rev w9,w8 - aese v0.16b,v4.16b - aese v1.16b,v4.16b - add w8,w8,#1 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - rev w10,w8 - aese v0.16b,v5.16b - aese v1.16b,v5.16b - subs x2,x2,#2 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v18.16b - aese v1.16b,v18.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v19.16b - aese v1.16b,v19.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v20.16b - aese v1.16b,v20.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v21.16b - aese v1.16b,v21.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v22.16b - aese v1.16b,v22.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - eor v2.16b,v2.16b,v7.16b - aese v0.16b,v23.16b - eor v3.16b,v3.16b,v7.16b - aese v1.16b,v23.16b - - eor v2.16b,v2.16b,v0.16b - orr v0.16b,v6.16b,v6.16b - eor v3.16b,v3.16b,v1.16b - orr v1.16b,v6.16b,v6.16b - st1 {v2.16b},[x1],#16 - mov v0.s[3], w9 - st1 {v3.16b},[x1],#16 - mov v1.s[3], w10 - b.hs .Loop2x_ctr32_128 - - adds x2,x2,#2 - b.eq .Lctr32_done - -.Lctr32_tail: - aese v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesmc v0.16b,v0.16b - subs w6,w6,#2 - aese v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesmc v0.16b,v0.16b - b.gt .Lctr32_tail - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - ld1 {v2.16b},[x0] - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - eor v2.16b,v2.16b,v7.16b - aese v0.16b,v23.16b - - eor 
v2.16b,v2.16b,v0.16b - st1 {v2.16b},[x1] - -.Lctr32_done: - ldr x29,[sp],#16 - ret -.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks -#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.S b/app/openssl/crypto/aes/asm/aesv8-armx.S deleted file mode 100644 index 1637e4d4..00000000 --- a/app/openssl/crypto/aes/asm/aesv8-armx.S +++ /dev/null @@ -1,767 +0,0 @@ -#include "arm_arch.h" - -#if __ARM_ARCH__>=7 -.text -.fpu neon -.code 32 -.align 5 -rcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.globl aes_v8_set_encrypt_key -.type aes_v8_set_encrypt_key,%function -.align 5 -aes_v8_set_encrypt_key: -.Lenc_key: - adr r3,rcon - cmp r1,#192 - - veor q0,q0,q0 - vld1.8 {q3},[r0]! - mov r1,#8 @ reuse r1 - vld1.32 {q1,q2},[r3]! - - blt .Loop128 - beq .L192 - b .L256 - -.align 4 -.Loop128: - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - bne .Loop128 - - vld1.32 {q1},[r3] - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - veor q3,q3,q10 - vst1.32 {q3},[r2] - add r2,r2,#0x50 - - mov r12,#10 - b .Ldone - -.align 4 -.L192: - vld1.8 {d16},[r0]! - vmov.i8 q10,#8 @ borrow q10 - vst1.32 {q3},[r2]! 
- vsub.i8 q2,q2,q10 @ adjust the mask - -.Loop192: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {d16},[r2]! - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - - vdup.32 q9,d7[1] - veor q9,q9,q8 - veor q10,q10,q1 - vext.8 q8,q0,q8,#12 - vshl.u8 q1,q1,#1 - veor q8,q8,q9 - veor q3,q3,q10 - veor q8,q8,q10 - vst1.32 {q3},[r2]! - bne .Loop192 - - mov r12,#12 - add r2,r2,#0x20 - b .Ldone - -.align 4 -.L256: - vld1.8 {q8},[r0] - mov r1,#7 - mov r12,#14 - vst1.32 {q3},[r2]! - -.Loop256: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q8},[r2]! - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - vst1.32 {q3},[r2]! - beq .Ldone - - vdup.32 q10,d7[1] - vext.8 q9,q0,q8,#12 - .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - - veor q8,q8,q10 - b .Loop256 - -.Ldone: - str r12,[r2] - - eor r0,r0,r0 @ return value - - bx lr -.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key - -.globl aes_v8_set_decrypt_key -.type aes_v8_set_decrypt_key,%function -.align 5 -aes_v8_set_decrypt_key: - stmdb sp!,{r4,lr} - bl .Lenc_key - - sub r2,r2,#240 @ restore original r2 - mov r4,#-16 - add r0,r2,r12,lsl#4 @ end of key schedule - - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - -.Loop_imc: - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! 
- cmp r0,r2 - bhi .Loop_imc - - vld1.32 {q0},[r2] - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vst1.32 {q0},[r0] - - eor r0,r0,r0 @ return value - ldmia sp!,{r4,pc} -.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key -.globl aes_v8_encrypt -.type aes_v8_encrypt,%function -.align 5 -aes_v8_encrypt: - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -.Loop_enc: - .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2]! - .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - subs r3,r3,#2 - .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - vld1.32 {q1},[r2]! - .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - bgt .Loop_enc - - .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2] - .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr -.size aes_v8_encrypt,.-aes_v8_encrypt -.globl aes_v8_decrypt -.type aes_v8_decrypt,%function -.align 5 -aes_v8_decrypt: - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -.Loop_dec: - .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2]! - .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - subs r3,r3,#2 - .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - vld1.32 {q1},[r2]! - .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - bgt .Loop_dec - - .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2] - .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr -.size aes_v8_decrypt,.-aes_v8_decrypt -.globl aes_v8_cbc_encrypt -.type aes_v8_cbc_encrypt,%function -.align 5 -aes_v8_cbc_encrypt: - mov ip,sp - stmdb sp!,{r4-r8,lr} - vstmdb sp!,{d8-d15} @ ABI specification says so - ldmia ip,{r4-r5} @ load remaining args - subs r2,r2,#16 - mov r8,#16 - blo .Lcbc_abort - moveq r8,#0 - - cmp r5,#0 @ en- or decrypting? - ldr r5,[r3,#240] - and r2,r2,#-16 - vld1.8 {q6},[r4] - vld1.8 {q0},[r0],r8 - - vld1.32 {q8-q9},[r3] @ load key schedule... 
- sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10-q11},[r7]! - vld1.32 {q12-q13},[r7]! - vld1.32 {q14-q15},[r7]! - vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - beq .Lcbc_dec - - cmp r5,#2 - veor q0,q0,q6 - veor q5,q8,q7 - beq .Lcbc_enc128 - -.Loop_cbc_enc: - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - vld1.32 {q8},[r7]! - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r6,r6,#2 - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - vld1.32 {q9},[r7]! - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - bgt .Loop_cbc_enc - - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 - .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r7,r3,#16 - .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 - .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 - .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - - mov r6,r5 - veor q6,q0,q7 - vst1.8 {q6},[r1]! - bhs .Loop_cbc_enc - - b .Lcbc_done - -.align 5 -.Lcbc_enc128: - vld1.32 {q2-q3},[r7] - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - b .Lenter_cbc_enc128 -.Loop_cbc_enc128: - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! 
-.Lenter_cbc_enc128: - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 - .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 - .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 - .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 - .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs .Loop_cbc_enc128 - - vst1.8 {q6},[r1]! - b .Lcbc_done - -.align 5 -.Lcbc_dec128: - vld1.32 {q4-q5},[r7] - veor q6,q6,q7 - veor q2,q0,q7 - mov r12,r8 - -.Loop2x_cbc_dec128: - .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - subs r2,r2,#32 - .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - movlo r8,#0 - .byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4 - .byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - movls r12,#0 - .byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5 - .byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 - .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 - .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 - .byte 0xc0,0x03,0xb0,0xf3 @ 
aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 - .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 - .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 - .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 - .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 - - veor q6,q6,q0 - vld1.8 {q0},[r0],r8 - veor q2,q2,q1 - vld1.8 {q1},[r0],r12 - vst1.8 {q6},[r1]! - veor q6,q3,q7 - vst1.8 {q2},[r1]! - veor q2,q0,q7 - vorr q3,q1,q1 - bhs .Loop2x_cbc_dec128 - - adds r2,r2,#32 - veor q6,q6,q7 - beq .Lcbc_done - veor q2,q2,q7 - b .Lcbc_dec_tail - -.align 5 -.Lcbc_dec: - subs r2,r2,#16 - vorr q2,q0,q0 - blo .Lcbc_dec_tail - - moveq r8,#0 - cmp r5,#2 - vld1.8 {q1},[r0],r8 - vorr q3,q1,q1 - beq .Lcbc_dec128 - -.Loop2x_cbc_dec: - .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - vld1.32 {q8},[r7]! - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - subs r6,r6,#2 - .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - vld1.32 {q9},[r7]! 
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - bgt .Loop2x_cbc_dec - - .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - veor q4,q6,q7 - veor q5,q2,q7 - .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vorr q6,q3,q3 - subs r2,r2,#32 - .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 - .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - movlo r8,#0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - mov r7,r3 - .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 - .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vld1.8 {q2},[r0],r8 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - movls r8,#0 - .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 - .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vld1.8 {q3},[r0],r8 - .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 - .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 - .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 - .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 - - mov r6,r5 - veor q4,q4,q0 - veor q5,q5,q1 - vorr q0,q2,q2 - vst1.8 {q4},[r1]! - vorr q1,q3,q3 - vst1.8 {q5},[r1]! - bhs .Loop2x_cbc_dec - - adds r2,r2,#32 - beq .Lcbc_done - -.Lcbc_dec_tail: - .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - vld1.32 {q8},[r7]! - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - subs r6,r6,#2 - .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - vld1.32 {q9},[r7]! 
- .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - bgt .Lcbc_dec_tail - - .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - veor q4,q6,q7 - .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vorr q6,q2,q2 - .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 - .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 - - veor q4,q4,q0 - vst1.8 {q4},[r1]! - -.Lcbc_done: - vst1.8 {q6},[r4] -.Lcbc_abort: - vldmia sp!,{d8-d15} - ldmia sp!,{r4-r8,pc} -.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt -.globl aes_v8_ctr32_encrypt_blocks -.type aes_v8_ctr32_encrypt_blocks,%function -.align 5 -aes_v8_ctr32_encrypt_blocks: - mov ip,sp - stmdb sp!,{r4-r10,lr} - vstmdb sp!,{d8-d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg - ldr r5,[r3,#240] - - ldr r8, [r4, #12] - vld1.32 {q0},[r4] - - vld1.32 {q8-q9},[r3] @ load key schedule... - sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10-q11},[r7]! - vld1.32 {q12-q13},[r7]! - vld1.32 {q14-q15},[r7]! - vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - - subs r2,r2,#2 - blo .Lctr32_tail - -#ifndef __ARMEB__ - rev r8, r8 -#endif - vorr q1,q0,q0 - add r8, r8, #1 - vorr q6,q0,q0 - rev r10, r8 - cmp r5,#2 - vmov.32 d3[1],r10 - beq .Lctr32_128 - -.Loop2x_ctr32: - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - vld1.32 {q8},[r7]! - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - subs r6,r6,#2 - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - vld1.32 {q9},[r7]! 
- .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - bgt .Loop2x_ctr32 - - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 - vorr q0,q6,q6 - .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - vorr q1,q6,q6 - .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 - .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 - vld1.8 {q2},[r0]! - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vld1.8 {q3},[r0]! - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - add r8,r8,#1 - .byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10 - .byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10 - rev r9,r8 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - add r8,r8,#1 - .byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11 - .byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11 - veor q2,q2,q7 - rev r10,r8 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - veor q3,q3,q7 - mov r7,r3 - .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 - .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 - subs r2,r2,#2 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1] - .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 - .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 - .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 - vmov.32 d1[1], r9 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vmov.32 d3[1], r10 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 - .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 - - mov r6,r5 - veor q2,q2,q4 - veor q3,q3,q5 - vst1.8 {q2},[r1]! - vst1.8 {q3},[r1]! 
- bhs .Loop2x_ctr32 - - adds r2,r2,#2 - beq .Lctr32_done - b .Lctr32_tail - -.Lctr32_128: - vld1.32 {q4-q5},[r7] - -.Loop2x_ctr32_128: - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q2},[r0]! - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q3},[r0]! - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - add r8,r8,#1 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - rev r9,r8 - .byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4 - .byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4 - add r8,r8,#1 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - rev r10,r8 - .byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5 - .byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5 - subs r2,r2,#2 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 - .byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 - .byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q2,q2,q7 - .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q3,q3,q7 - .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 - - veor q2,q2,q0 - vorr q0,q6,q6 - veor q3,q3,q1 - vorr q1,q6,q6 - vst1.8 {q2},[r1]! - vmov.32 d1[1], r9 - vst1.8 {q3},[r1]! 
- vmov.32 d3[1], r10 - bhs .Loop2x_ctr32_128 - - adds r2,r2,#2 - beq .Lctr32_done - -.Lctr32_tail: - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - vld1.32 {q8},[r7]! - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r6,r6,#2 - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - vld1.32 {q9},[r7]! - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - bgt .Lctr32_tail - - .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q2},[r0] - .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q2,q2,q7 - .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - - veor q2,q2,q0 - vst1.8 {q2},[r1] - -.Lctr32_done: - vldmia sp!,{d8-d15} - ldmia sp!,{r4-r10,pc} -.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks -#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.pl b/app/openssl/crypto/aes/asm/aesv8-armx.pl deleted file mode 100644 index 415dc04a..00000000 --- a/app/openssl/crypto/aes/asm/aesv8-armx.pl +++ /dev/null @@ -1,980 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements support for ARMv8 AES instructions. The -# module is endian-agnostic in sense that it supports both big- and -# little-endian cases. 
As does it support both 32- and 64-bit modes -# of operation. Latter is achieved by limiting amount of utilized -# registers to 16, which implies additional instructions. This has -# no effect on mighty Apple A7, as results are literally equal to -# the theoretical estimates based on instruction latencies and issue -# rate. It remains to be seen how does it affect other platforms... -# -# Performance in cycles per byte processed with 128-bit key: -# -# CBC enc CBC dec CTR -# Apple A7 2.39 1.20 1.20 -# Cortex-A5x n/a n/a n/a - -$flavour = shift; -open STDOUT,">".shift; - -$prefix="aes_v8"; - -$code=<<___; -#include "arm_arch.h" - -#if __ARM_ARCH__>=7 -.text -___ -$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); -$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); - -# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, -# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to -# maintain both 32- and 64-bit codes within single module and -# transliterate common code to either flavour with regex vodoo. -# -{{{ -my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); -my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= - $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); - - -$code.=<<___; -.align 5 -rcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.globl ${prefix}_set_encrypt_key -.type ${prefix}_set_encrypt_key,%function -.align 5 -${prefix}_set_encrypt_key: -.Lenc_key: -___ -$code.=<<___ if ($flavour =~ /64/); - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 -___ -$code.=<<___; - adr $ptr,rcon - cmp $bits,#192 - - veor $zero,$zero,$zero - vld1.8 {$in0},[$inp],#16 - mov $bits,#8 // reuse $bits - vld1.32 {$rcon,$mask},[$ptr],#32 - - b.lt .Loop128 - b.eq .L192 - b .L256 - -.align 4 -.Loop128: - vtbl.8 $key,{$in0},$mask - vext.8 $tmp,$zero,$in0,#12 - vst1.32 {$in0},[$out],#16 - aese $key,$zero - subs $bits,$bits,#1 - - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $key,$key,$rcon - veor $in0,$in0,$tmp - vshl.u8 $rcon,$rcon,#1 - veor $in0,$in0,$key - b.ne .Loop128 - - vld1.32 {$rcon},[$ptr] - - vtbl.8 $key,{$in0},$mask - vext.8 $tmp,$zero,$in0,#12 - vst1.32 {$in0},[$out],#16 - aese $key,$zero - - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $key,$key,$rcon - veor $in0,$in0,$tmp - vshl.u8 $rcon,$rcon,#1 - veor $in0,$in0,$key - - vtbl.8 $key,{$in0},$mask - vext.8 $tmp,$zero,$in0,#12 - vst1.32 {$in0},[$out],#16 - aese $key,$zero - - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $key,$key,$rcon - veor $in0,$in0,$tmp - veor $in0,$in0,$key - vst1.32 {$in0},[$out] - add $out,$out,#0x50 - - mov $rounds,#10 - b .Ldone - -.align 4 -.L192: - vld1.8 {$in1},[$inp],#8 - vmov.i8 $key,#8 // borrow $key - vst1.32 {$in0},[$out],#16 - vsub.i8 $mask,$mask,$key // adjust the mask - -.Loop192: - vtbl.8 $key,{$in1},$mask - vext.8 $tmp,$zero,$in0,#12 - vst1.32 {$in1},[$out],#8 - aese $key,$zero - subs $bits,$bits,#1 - - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - - vdup.32 $tmp,${in0}[3] - veor $tmp,$tmp,$in1 - veor $key,$key,$rcon - vext.8 $in1,$zero,$in1,#12 - vshl.u8 $rcon,$rcon,#1 - veor $in1,$in1,$tmp - veor $in0,$in0,$key - veor $in1,$in1,$key - vst1.32 {$in0},[$out],#16 - b.ne .Loop192 - - mov $rounds,#12 - add $out,$out,#0x20 - b .Ldone - -.align 4 -.L256: - vld1.8 
{$in1},[$inp] - mov $bits,#7 - mov $rounds,#14 - vst1.32 {$in0},[$out],#16 - -.Loop256: - vtbl.8 $key,{$in1},$mask - vext.8 $tmp,$zero,$in0,#12 - vst1.32 {$in1},[$out],#16 - aese $key,$zero - subs $bits,$bits,#1 - - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in0,$in0,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $key,$key,$rcon - veor $in0,$in0,$tmp - vshl.u8 $rcon,$rcon,#1 - veor $in0,$in0,$key - vst1.32 {$in0},[$out],#16 - b.eq .Ldone - - vdup.32 $key,${in0}[3] // just splat - vext.8 $tmp,$zero,$in1,#12 - aese $key,$zero - - veor $in1,$in1,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in1,$in1,$tmp - vext.8 $tmp,$zero,$tmp,#12 - veor $in1,$in1,$tmp - - veor $in1,$in1,$key - b .Loop256 - -.Ldone: - str $rounds,[$out] - - eor x0,x0,x0 // return value - `"ldr x29,[sp],#16" if ($flavour =~ /64/)` - ret -.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key - -.globl ${prefix}_set_decrypt_key -.type ${prefix}_set_decrypt_key,%function -.align 5 -${prefix}_set_decrypt_key: -___ -$code.=<<___ if ($flavour =~ /64/); - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 -___ -$code.=<<___ if ($flavour !~ /64/); - stmdb sp!,{r4,lr} -___ -$code.=<<___; - bl .Lenc_key - - sub $out,$out,#240 // restore original $out - mov x4,#-16 - add $inp,$out,x12,lsl#4 // end of key schedule - - vld1.32 {v0.16b},[$out] - vld1.32 {v1.16b},[$inp] - vst1.32 {v0.16b},[$inp],x4 - vst1.32 {v1.16b},[$out],#16 - -.Loop_imc: - vld1.32 {v0.16b},[$out] - vld1.32 {v1.16b},[$inp] - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - vst1.32 {v0.16b},[$inp],x4 - vst1.32 {v1.16b},[$out],#16 - cmp $inp,$out - b.hi .Loop_imc - - vld1.32 {v0.16b},[$out] - aesimc v0.16b,v0.16b - vst1.32 {v0.16b},[$inp] - - eor x0,x0,x0 // return value -___ -$code.=<<___ if ($flavour !~ /64/); - ldmia sp!,{r4,pc} -___ -$code.=<<___ if ($flavour =~ /64/); - ldp x29,x30,[sp],#16 - ret -___ -$code.=<<___; -.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key -___ -}}} -{{{ -sub gen_block () { -my $dir = shift; -my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); -my ($inp,$out,$key)=map("x$_",(0..2)); -my $rounds="w3"; -my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); - -$code.=<<___; -.globl ${prefix}_${dir}crypt -.type ${prefix}_${dir}crypt,%function -.align 5 -${prefix}_${dir}crypt: - ldr $rounds,[$key,#240] - vld1.32 {$rndkey0},[$key],#16 - vld1.8 {$inout},[$inp] - sub $rounds,$rounds,#2 - vld1.32 {$rndkey1},[$key],#16 - -.Loop_${dir}c: - aes$e $inout,$rndkey0 - vld1.32 {$rndkey0},[$key],#16 - aes$mc $inout,$inout - subs $rounds,$rounds,#2 - aes$e $inout,$rndkey1 - vld1.32 {$rndkey1},[$key],#16 - aes$mc $inout,$inout - b.gt .Loop_${dir}c - - aes$e $inout,$rndkey0 - vld1.32 {$rndkey0},[$key] - aes$mc $inout,$inout - aes$e $inout,$rndkey1 - veor $inout,$inout,$rndkey0 - - vst1.8 {$inout},[$out] - ret -.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt -___ -} -&gen_block("en"); -&gen_block("de"); -}}} -{{{ -my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; -my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); -my 
($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); - -my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); - -### q8-q15 preloaded key schedule - -$code.=<<___; -.globl ${prefix}_cbc_encrypt -.type ${prefix}_cbc_encrypt,%function -.align 5 -${prefix}_cbc_encrypt: -___ -$code.=<<___ if ($flavour =~ /64/); - stp x29,x30,[sp,#-16]! - add x29,sp,#0 -___ -$code.=<<___ if ($flavour !~ /64/); - mov ip,sp - stmdb sp!,{r4-r8,lr} - vstmdb sp!,{d8-d15} @ ABI specification says so - ldmia ip,{r4-r5} @ load remaining args -___ -$code.=<<___; - subs $len,$len,#16 - mov $step,#16 - b.lo .Lcbc_abort - cclr $step,eq - - cmp $enc,#0 // en- or decrypting? - ldr $rounds,[$key,#240] - and $len,$len,#-16 - vld1.8 {$ivec},[$ivp] - vld1.8 {$dat},[$inp],$step - - vld1.32 {q8-q9},[$key] // load key schedule... - sub $rounds,$rounds,#6 - add $key_,$key,x5,lsl#4 // pointer to last 7 round keys - sub $rounds,$rounds,#2 - vld1.32 {q10-q11},[$key_],#32 - vld1.32 {q12-q13},[$key_],#32 - vld1.32 {q14-q15},[$key_],#32 - vld1.32 {$rndlast},[$key_] - - add $key_,$key,#32 - mov $cnt,$rounds - b.eq .Lcbc_dec - - cmp $rounds,#2 - veor $dat,$dat,$ivec - veor $rndzero_n_last,q8,$rndlast - b.eq .Lcbc_enc128 - -.Loop_cbc_enc: - aese $dat,q8 - vld1.32 {q8},[$key_],#16 - aesmc $dat,$dat - subs $cnt,$cnt,#2 - aese $dat,q9 - vld1.32 {q9},[$key_],#16 - aesmc $dat,$dat - b.gt .Loop_cbc_enc - - aese $dat,q8 - aesmc $dat,$dat - subs $len,$len,#16 - aese $dat,q9 - aesmc $dat,$dat - cclr $step,eq - aese $dat,q10 - aesmc $dat,$dat - add $key_,$key,#16 - aese $dat,q11 - aesmc $dat,$dat - vld1.8 {q8},[$inp],$step - aese $dat,q12 - aesmc $dat,$dat - veor q8,q8,$rndzero_n_last - aese $dat,q13 - aesmc $dat,$dat - vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] - aese $dat,q14 - aesmc $dat,$dat - aese $dat,q15 - - mov $cnt,$rounds - veor $ivec,$dat,$rndlast - vst1.8 {$ivec},[$out],#16 - b.hs .Loop_cbc_enc - - b .Lcbc_done - -.align 5 -.Lcbc_enc128: - vld1.32 {$in0-$in1},[$key_] - aese $dat,q8 - aesmc 
$dat,$dat - b .Lenter_cbc_enc128 -.Loop_cbc_enc128: - aese $dat,q8 - aesmc $dat,$dat - vst1.8 {$ivec},[$out],#16 -.Lenter_cbc_enc128: - aese $dat,q9 - aesmc $dat,$dat - subs $len,$len,#16 - aese $dat,$in0 - aesmc $dat,$dat - cclr $step,eq - aese $dat,$in1 - aesmc $dat,$dat - aese $dat,q10 - aesmc $dat,$dat - aese $dat,q11 - aesmc $dat,$dat - vld1.8 {q8},[$inp],$step - aese $dat,q12 - aesmc $dat,$dat - aese $dat,q13 - aesmc $dat,$dat - aese $dat,q14 - aesmc $dat,$dat - veor q8,q8,$rndzero_n_last - aese $dat,q15 - veor $ivec,$dat,$rndlast - b.hs .Loop_cbc_enc128 - - vst1.8 {$ivec},[$out],#16 - b .Lcbc_done - -.align 5 -.Lcbc_dec128: - vld1.32 {$tmp0-$tmp1},[$key_] - veor $ivec,$ivec,$rndlast - veor $in0,$dat0,$rndlast - mov $step1,$step - -.Loop2x_cbc_dec128: - aesd $dat0,q8 - aesd $dat1,q8 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - subs $len,$len,#32 - aesd $dat0,q9 - aesd $dat1,q9 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - cclr $step,lo - aesd $dat0,$tmp0 - aesd $dat1,$tmp0 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - cclr $step1,ls - aesd $dat0,$tmp1 - aesd $dat1,$tmp1 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q10 - aesd $dat1,q10 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q11 - aesd $dat1,q11 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q12 - aesd $dat1,q12 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q13 - aesd $dat1,q13 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q14 - aesd $dat1,q14 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q15 - aesd $dat1,q15 - - veor $ivec,$ivec,$dat0 - vld1.8 {$dat0},[$inp],$step - veor $in0,$in0,$dat1 - vld1.8 {$dat1},[$inp],$step1 - vst1.8 {$ivec},[$out],#16 - veor $ivec,$in1,$rndlast - vst1.8 {$in0},[$out],#16 - veor $in0,$dat0,$rndlast - vorr $in1,$dat1,$dat1 - b.hs .Loop2x_cbc_dec128 - - adds $len,$len,#32 - veor $ivec,$ivec,$rndlast - b.eq .Lcbc_done - veor $in0,$in0,$rndlast - b .Lcbc_dec_tail - -.align 5 -.Lcbc_dec: - subs $len,$len,#16 - vorr $in0,$dat,$dat - 
b.lo .Lcbc_dec_tail - - cclr $step,eq - cmp $rounds,#2 - vld1.8 {$dat1},[$inp],$step - vorr $in1,$dat1,$dat1 - b.eq .Lcbc_dec128 - -.Loop2x_cbc_dec: - aesd $dat0,q8 - aesd $dat1,q8 - vld1.32 {q8},[$key_],#16 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - subs $cnt,$cnt,#2 - aesd $dat0,q9 - aesd $dat1,q9 - vld1.32 {q9},[$key_],#16 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - b.gt .Loop2x_cbc_dec - - aesd $dat0,q8 - aesd $dat1,q8 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - veor $tmp0,$ivec,$rndlast - veor $tmp1,$in0,$rndlast - aesd $dat0,q9 - aesd $dat1,q9 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - vorr $ivec,$in1,$in1 - subs $len,$len,#32 - aesd $dat0,q10 - aesd $dat1,q10 - aesimc $dat0,$dat0 - cclr $step,lo - aesimc $dat1,$dat1 - mov $key_,$key - aesd $dat0,q11 - aesd $dat1,q11 - aesimc $dat0,$dat0 - vld1.8 {$in0},[$inp],$step - aesimc $dat1,$dat1 - cclr $step,ls - aesd $dat0,q12 - aesd $dat1,q12 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - vld1.8 {$in1},[$inp],$step - aesd $dat0,q13 - aesd $dat1,q13 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] - aesd $dat0,q14 - aesd $dat1,q14 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] - aesd $dat0,q15 - aesd $dat1,q15 - - mov $cnt,$rounds - veor $tmp0,$tmp0,$dat0 - veor $tmp1,$tmp1,$dat1 - vorr $dat0,$in0,$in0 - vst1.8 {$tmp0},[$out],#16 - vorr $dat1,$in1,$in1 - vst1.8 {$tmp1},[$out],#16 - b.hs .Loop2x_cbc_dec - - adds $len,$len,#32 - b.eq .Lcbc_done - -.Lcbc_dec_tail: - aesd $dat,q8 - vld1.32 {q8},[$key_],#16 - aesimc $dat,$dat - subs $cnt,$cnt,#2 - aesd $dat,q9 - vld1.32 {q9},[$key_],#16 - aesimc $dat,$dat - b.gt .Lcbc_dec_tail - - aesd $dat,q8 - aesimc $dat,$dat - aesd $dat,q9 - aesimc $dat,$dat - veor $tmp,$ivec,$rndlast - aesd $dat,q10 - aesimc $dat,$dat - vorr $ivec,$in0,$in0 - aesd $dat,q11 - aesimc $dat,$dat - aesd $dat,q12 - aesimc $dat,$dat - aesd $dat,q13 - aesimc $dat,$dat - aesd $dat,q14 - aesimc $dat,$dat - aesd $dat,q15 
- - veor $tmp,$tmp,$dat - vst1.8 {$tmp},[$out],#16 - -.Lcbc_done: - vst1.8 {$ivec},[$ivp] -.Lcbc_abort: -___ -$code.=<<___ if ($flavour !~ /64/); - vldmia sp!,{d8-d15} - ldmia sp!,{r4-r8,pc} -___ -$code.=<<___ if ($flavour =~ /64/); - ldr x29,[sp],#16 - ret -___ -$code.=<<___; -.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt -___ -}}} -{{{ -my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); -my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); -my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); - -my ($dat,$tmp)=($dat0,$tmp0); - -### q8-q15 preloaded key schedule - -$code.=<<___; -.globl ${prefix}_ctr32_encrypt_blocks -.type ${prefix}_ctr32_encrypt_blocks,%function -.align 5 -${prefix}_ctr32_encrypt_blocks: -___ -$code.=<<___ if ($flavour =~ /64/); - stp x29,x30,[sp,#-16]! - add x29,sp,#0 -___ -$code.=<<___ if ($flavour !~ /64/); - mov ip,sp - stmdb sp!,{r4-r10,lr} - vstmdb sp!,{d8-d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg -___ -$code.=<<___; - ldr $rounds,[$key,#240] - - ldr $ctr, [$ivp, #12] - vld1.32 {$dat0},[$ivp] - - vld1.32 {q8-q9},[$key] // load key schedule... 
- sub $rounds,$rounds,#6 - add $key_,$key,x5,lsl#4 // pointer to last 7 round keys - sub $rounds,$rounds,#2 - vld1.32 {q10-q11},[$key_],#32 - vld1.32 {q12-q13},[$key_],#32 - vld1.32 {q14-q15},[$key_],#32 - vld1.32 {$rndlast},[$key_] - - add $key_,$key,#32 - mov $cnt,$rounds - - subs $len,$len,#2 - b.lo .Lctr32_tail - -#ifndef __ARMEB__ - rev $ctr, $ctr -#endif - vorr $dat1,$dat0,$dat0 - add $ctr, $ctr, #1 - vorr $ivec,$dat0,$dat0 - rev $tctr1, $ctr - cmp $rounds,#2 - vmov.32 ${dat1}[3],$tctr1 - b.eq .Lctr32_128 - -.Loop2x_ctr32: - aese $dat0,q8 - aese $dat1,q8 - vld1.32 {q8},[$key_],#16 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - subs $cnt,$cnt,#2 - aese $dat0,q9 - aese $dat1,q9 - vld1.32 {q9},[$key_],#16 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - b.gt .Loop2x_ctr32 - - aese $dat0,q8 - aese $dat1,q8 - aesmc $tmp0,$dat0 - vorr $dat0,$ivec,$ivec - aesmc $tmp1,$dat1 - vorr $dat1,$ivec,$ivec - aese $tmp0,q9 - aese $tmp1,q9 - vld1.8 {$in0},[$inp],#16 - aesmc $tmp0,$tmp0 - vld1.8 {$in1},[$inp],#16 - aesmc $tmp1,$tmp1 - add $ctr,$ctr,#1 - aese $tmp0,q10 - aese $tmp1,q10 - rev $tctr,$ctr - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 - add $ctr,$ctr,#1 - aese $tmp0,q11 - aese $tmp1,q11 - veor $in0,$in0,$rndlast - rev $tctr1,$ctr - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 - veor $in1,$in1,$rndlast - mov $key_,$key - aese $tmp0,q12 - aese $tmp1,q12 - subs $len,$len,#2 - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 - vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1] - aese $tmp0,q13 - aese $tmp1,q13 - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 - aese $tmp0,q14 - aese $tmp1,q14 - vmov.32 ${dat0}[3], $tctr - aesmc $tmp0,$tmp0 - vmov.32 ${dat1}[3], $tctr1 - aesmc $tmp1,$tmp1 - aese $tmp0,q15 - aese $tmp1,q15 - - mov $cnt,$rounds - veor $in0,$in0,$tmp0 - veor $in1,$in1,$tmp1 - vst1.8 {$in0},[$out],#16 - vst1.8 {$in1},[$out],#16 - b.hs .Loop2x_ctr32 - - adds $len,$len,#2 - b.eq .Lctr32_done - b .Lctr32_tail - -.Lctr32_128: - vld1.32 {$tmp0-$tmp1},[$key_] - -.Loop2x_ctr32_128: - aese $dat0,q8 - aese 
$dat1,q8 - aesmc $dat0,$dat0 - vld1.8 {$in0},[$inp],#16 - aesmc $dat1,$dat1 - vld1.8 {$in1},[$inp],#16 - aese $dat0,q9 - aese $dat1,q9 - add $ctr,$ctr,#1 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - rev $tctr,$ctr - aese $dat0,$tmp0 - aese $dat1,$tmp0 - add $ctr,$ctr,#1 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - rev $tctr1,$ctr - aese $dat0,$tmp1 - aese $dat1,$tmp1 - subs $len,$len,#2 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - aese $dat0,q10 - aese $dat1,q10 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - aese $dat0,q11 - aese $dat1,q11 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - aese $dat0,q12 - aese $dat1,q12 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - aese $dat0,q13 - aese $dat1,q13 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - aese $dat0,q14 - aese $dat1,q14 - aesmc $dat0,$dat0 - aesmc $dat1,$dat1 - veor $in0,$in0,$rndlast - aese $dat0,q15 - veor $in1,$in1,$rndlast - aese $dat1,q15 - - veor $in0,$in0,$dat0 - vorr $dat0,$ivec,$ivec - veor $in1,$in1,$dat1 - vorr $dat1,$ivec,$ivec - vst1.8 {$in0},[$out],#16 - vmov.32 ${dat0}[3], $tctr - vst1.8 {$in1},[$out],#16 - vmov.32 ${dat1}[3], $tctr1 - b.hs .Loop2x_ctr32_128 - - adds $len,$len,#2 - b.eq .Lctr32_done - -.Lctr32_tail: - aese $dat,q8 - vld1.32 {q8},[$key_],#16 - aesmc $dat,$dat - subs $cnt,$cnt,#2 - aese $dat,q9 - vld1.32 {q9},[$key_],#16 - aesmc $dat,$dat - b.gt .Lctr32_tail - - aese $dat,q8 - aesmc $dat,$dat - aese $dat,q9 - aesmc $dat,$dat - vld1.8 {$in0},[$inp] - aese $dat,q10 - aesmc $dat,$dat - aese $dat,q11 - aesmc $dat,$dat - aese $dat,q12 - aesmc $dat,$dat - aese $dat,q13 - aesmc $dat,$dat - aese $dat,q14 - aesmc $dat,$dat - veor $in0,$in0,$rndlast - aese $dat,q15 - - veor $in0,$in0,$dat - vst1.8 {$in0},[$out] - -.Lctr32_done: -___ -$code.=<<___ if ($flavour !~ /64/); - vldmia sp!,{d8-d15} - ldmia sp!,{r4-r10,pc} -___ -$code.=<<___ if ($flavour =~ /64/); - ldr x29,[sp],#16 - ret -___ -$code.=<<___; -.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks -___ -}}} -$code.=<<___; -#endif -___ 
########################################
# Output pass.
#
# $code holds flavour-neutral assembly that mixes 32-bit NEON and 64-bit
# AdvSIMD spellings.  Each line is rewritten for the flavour selected by
# $flavour and printed to STDOUT.  Text between backticks is replaced by
# the result of evaluating it as Perl.
if ($flavour =~ /64/) {			######## 64-bit code
    # Base encodings of the AES instructions for the 64-bit flavour.
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800	);

    # unaes(mnemonic, operands): hand-encode an AES instruction as an
    # .inst word; the first register number is OR-ed into the low bits,
    # the second is shifted left by 5.  Returns a false value when the
    # operands do not parse.
    # NOTE: the substitution that would call this helper is commented
    # out in the loop below, so it is currently unused in this flavour.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	# qN becomes vN.16b; q8 and above are renumbered to v16 and above.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# At most one of the following mnemonic fix-ups applies per line.
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes: the element size given on the
	# mnemonic is moved onto the register arrangement specifier
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # Base encodings of the AES instructions for the 32-bit flavour.
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380	);

    # unaes(mnemonic, operands): hand-encode an AES instruction as four
    # .byte values, least significant byte first.  The first register
    # number lands in bits 13-15 and 22 of the word, the second in bits
    # 1-3 and 5.  Yields no replacement text when the operands do not
    # parse.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # unvtbl(operands): split "qD,{qT},qI" into two vtbl.8 instructions,
    # one per 64-bit half of the destination and index registers.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
		"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # unvdup32(operands): rewrite the lane reference "qN[i]" of a
    # vdup.32 source as the corresponding d-register lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # unvmov32(operands): rewrite the lane reference "qN[i]" of a
    # vmov.32 destination as the corresponding d-register lane.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes: post-indexed addressing
	# with an immediate becomes write-back ("!") addressing
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# At most one of the following mnemonic fix-ups applies per line.
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

# Buffered write errors (full disk, closed pipe) only surface when the
# handle is closed; fail loudly rather than leave a truncated assembly
# file behind a successful exit status.
close STDOUT or die "error closing STDOUT: $!";