From d0e7ba3029b2fd42582413aa95773fe7dbdede90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Parm=C3=A9nides=20GV?= Date: Tue, 23 Sep 2014 18:10:57 +0200 Subject: Updated native subprojects from ics-openvpn. --- app/openssl/crypto/aes/asm/aes-armv4.pl | 139 +++- app/openssl/crypto/aes/asm/aes-armv4.s | 160 ++++- app/openssl/crypto/aes/asm/aesv8-armx-64.S | 761 ++++++++++++++++++++++ app/openssl/crypto/aes/asm/aesv8-armx.S | 767 ++++++++++++++++++++++ app/openssl/crypto/aes/asm/aesv8-armx.pl | 980 +++++++++++++++++++++++++++++ 5 files changed, 2752 insertions(+), 55 deletions(-) create mode 100644 app/openssl/crypto/aes/asm/aesv8-armx-64.S create mode 100644 app/openssl/crypto/aes/asm/aesv8-armx.S create mode 100644 app/openssl/crypto/aes/asm/aesv8-armx.pl (limited to 'app/openssl/crypto/aes') diff --git a/app/openssl/crypto/aes/asm/aes-armv4.pl b/app/openssl/crypto/aes/asm/aes-armv4.pl index 86b86c4a..4f891708 100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.pl +++ b/app/openssl/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. 
@@ -51,9 +51,23 @@ $key="r11"; $rounds="r12"; $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + .text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else .code 32 +# endif +#endif .type AES_Te,%object .align 5 @@ -167,7 +181,11 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -409,11 +427,21 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif movne r0,#-1 bne .Labrt @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#216 beq .Ldone @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#256 beq .Ldone @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key: str $i3,[$key,#-4] b .L256_loop +.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global 
private_AES_set_decrypt_key @@ -688,34 +731,57 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldrne lr,[sp],#4 @ pop lr + ldr lr,[sp],#4 @ pop lr bne .Labrt - stmdb sp!,{r4-r12} + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov $key,r2 @ which is AES_KEY *key - mov $i1,r2 - add $i2,r2,$rounds,lsl#4 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr $rounds,[r0,#240] + mov $i1,r0 @ input + add $i2,r0,$rounds,lsl#4 + mov $key,r1 @ ouput + add $tbl,r1,$rounds,lsl#4 + str $rounds,[r1,#240] + +.Linv: ldr $s0,[$i1],#16 + ldr $s1,[$i1,#-12] + ldr $s2,[$i1,#-8] + ldr $s3,[$i1,#-4] + ldr $t1,[$i2],#-16 + ldr $t2,[$i2,#16+4] + ldr $t3,[$i2,#16+8] + ldr $i3,[$i2,#16+12] + str $s0,[$tbl],#-16 + str $s1,[$tbl,#16+4] + str $s2,[$tbl,#16+8] + str $s3,[$tbl,#16+12] + str $t1,[$key],#16 + str $t2,[$key,#-12] + str $t3,[$key,#-8] + str $i3,[$key,#-4] + teq $i1,$i2 + bne .Linv -.Linv: ldr $s0,[$i1] + ldr $s0,[$i1] ldr $s1,[$i1,#4] ldr $s2,[$i1,#8] ldr $s3,[$i1,#12] - ldr $t1,[$i2] - ldr $t2,[$i2,#4] - ldr $t3,[$i2,#8] - ldr $i3,[$i2,#12] - str $s0,[$i2],#-16 - str $s1,[$i2,#16+4] - str $s2,[$i2,#16+8] - str $s3,[$i2,#16+12] - str $t1,[$i1],#16 - str $t2,[$i1,#-12] - str $t3,[$i1,#-8] - str $i3,[$i1,#-4] - teq $i1,$i2 - bne .Linv + str $s0,[$key] + str $s1,[$key,#4] + str $s2,[$key,#8] + str $s3,[$key,#12] + sub $key,$key,$rounds,lsl#3 ___ $mask80=$i1; $mask1b=$i2; @@ -773,7 +839,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size 
AES_set_enc2dec_key,.-AES_set_enc2dec_key .type AES_Td,%object .align 5 @@ -883,7 +949,11 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt: ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] and $i3,lr,$s1,lsr#8 + add $s1,$tbl,$s1,lsr#24 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] - ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] + ldrb $s1,[$s1] @ Td4[s1>>24] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] eor $s0,$i1,$s0,lsl#24 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] + add $s2,$tbl,$s2,lsr#24 + ldrb $s2,[$s2] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] eor $s1,$i2,$s1,lsl#16 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 + add $s3,$tbl,$s3,lsr#24 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] - ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] + ldrb $s3,[$s3] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 ldr $i1,[$key,#0] eor $s1,$s1,$i2,lsl#8 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/app/openssl/crypto/aes/asm/aes-armv4.s b/app/openssl/crypto/aes/asm/aes-armv4.s index 2697d4ce..333a5227 100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.s +++ b/app/openssl/crypto/aes/asm/aes-armv4.s @@ -1,6 +1,53 @@ -#include "arm_arch.h" + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. 
For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + .text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else .code 32 +# endif +#endif .type AES_Te,%object .align 5 @@ -114,7 +161,11 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov r12,r0 @ inp mov r11,r2 @@ -356,11 +407,21 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt @@ -369,6 +430,9 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in 
ARM +#endif movne r0,#-1 bne .Labrt @@ -523,6 +587,9 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-16] subs r12,r12,#1 str r3,[r11,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,r11,#216 beq .Ldone @@ -592,6 +659,9 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-24] subs r12,r12,#1 str r3,[r11,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,r11,#256 beq .Ldone @@ -621,11 +691,17 @@ _armv4_AES_set_encrypt_key: str r9,[r11,#-4] b .L256_loop +.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -635,34 +711,57 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldrne lr,[sp],#4 @ pop lr + ldr lr,[sp],#4 @ pop lr bne .Labrt - stmdb sp!,{r4-r12} + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr r12,[r0,#240] + mov r7,r0 @ input + add r8,r0,r12,lsl#4 + mov r11,r1 @ ouput + add r10,r1,r12,lsl#4 + str r12,[r1,#240] - ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov r11,r2 @ which is AES_KEY *key - mov r7,r2 - add r8,r2,r12,lsl#4 +.Linv: ldr r0,[r7],#16 + ldr r1,[r7,#-12] + ldr r2,[r7,#-8] + ldr r3,[r7,#-4] + ldr r4,[r8],#-16 + ldr r5,[r8,#16+4] + ldr r6,[r8,#16+8] + ldr r9,[r8,#16+12] + str r0,[r10],#-16 + str r1,[r10,#16+4] + str r2,[r10,#16+8] + str r3,[r10,#16+12] + str r4,[r11],#16 + str 
r5,[r11,#-12] + str r6,[r11,#-8] + str r9,[r11,#-4] + teq r7,r8 + bne .Linv -.Linv: ldr r0,[r7] + ldr r0,[r7] ldr r1,[r7,#4] ldr r2,[r7,#8] ldr r3,[r7,#12] - ldr r4,[r8] - ldr r5,[r8,#4] - ldr r6,[r8,#8] - ldr r9,[r8,#12] - str r0,[r8],#-16 - str r1,[r8,#16+4] - str r2,[r8,#16+8] - str r3,[r8,#16+12] - str r4,[r7],#16 - str r5,[r7,#-12] - str r6,[r7,#-8] - str r9,[r7,#-4] - teq r7,r8 - bne .Linv + str r0,[r11] + str r1,[r11,#4] + str r2,[r11,#8] + str r3,[r11,#12] + sub r11,r11,r12,lsl#3 ldr r0,[r11,#16]! @ prefetch tp1 mov r7,#0x80 mov r8,#0x1b @@ -715,7 +814,7 @@ private_AES_set_decrypt_key: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key .type AES_Td,%object .align 5 @@ -825,7 +924,11 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov r12,r0 @ inp mov r11,r2 @@ -1022,8 +1125,9 @@ _armv4_AES_decrypt: ldrb r6,[r10,r9] @ Td4[s0>>0] and r9,lr,r1,lsr#8 + add r1,r10,r1,lsr#24 ldrb r7,[r10,r7] @ Td4[s1>>0] - ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24] + ldrb r1,[r1] @ Td4[s1>>24] ldrb r8,[r10,r8] @ Td4[s1>>16] eor r0,r7,r0,lsl#24 ldrb r9,[r10,r9] @ Td4[s1>>8] @@ -1036,7 +1140,8 @@ _armv4_AES_decrypt: ldrb r8,[r10,r8] @ Td4[s2>>0] and r9,lr,r2,lsr#16 - ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24] + add r2,r10,r2,lsr#24 + ldrb r2,[r2] @ Td4[s2>>24] eor r0,r0,r7,lsl#8 ldrb r9,[r10,r9] @ Td4[s2>>16] eor r1,r8,r1,lsl#16 @@ -1048,8 +1153,9 @@ _armv4_AES_decrypt: ldrb r8,[r10,r8] @ Td4[s3>>8] and r9,lr,r3 @ i2 + add r3,r10,r3,lsr#24 ldrb r9,[r10,r9] @ Td4[s3>>0] - ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24] + ldrb r3,[r3] @ Td4[s3>>24] eor r0,r0,r7,lsl#16 ldr r7,[r11,#0] eor r1,r1,r8,lsl#8 diff --git a/app/openssl/crypto/aes/asm/aesv8-armx-64.S b/app/openssl/crypto/aes/asm/aesv8-armx-64.S new file mode 
100644 index 00000000..be0a13df --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx-64.S @@ -0,0 +1,761 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.arch armv8-a+crypto +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + adr x3,rcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +.align 4 +.L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +.Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 
{v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne .Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b .Ldone + +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + + eor x0,x0,x0 // return value + ldr x29,[sp],#16 + ret +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl .Lenc_key + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +.Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi .Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value + ldp x29,x30,[sp],#16 + ret +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_enc: + aese v2.16b,v0.16b + ld1 {v0.4s},[x2],#16 + aesmc v2.16b,v2.16b + subs w3,w3,#2 + aese v2.16b,v1.16b + ld1 {v1.4s},[x2],#16 + aesmc v2.16b,v2.16b + b.gt .Loop_enc + + aese v2.16b,v0.16b + ld1 {v0.4s},[x2] + aesmc v2.16b,v2.16b + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_encrypt,.-aes_v8_encrypt +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_dec: + aesd v2.16b,v0.16b + ld1 {v0.4s},[x2],#16 + aesimc v2.16b,v2.16b + subs w3,w3,#2 + aesd v2.16b,v1.16b + ld1 {v1.4s},[x2],#16 + aesimc v2.16b,v2.16b + b.gt .Loop_dec + + aesd v2.16b,v0.16b + ld1 {v0.4s},[x2] + aesimc v2.16b,v2.16b + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_cbc_encrypt +.type aes_v8_cbc_encrypt,%function +.align 5 +aes_v8_cbc_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? 
+ ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s-v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s-v19.4s},[x7],#32 + ld1 {v20.4s-v21.4s},[x7],#32 + ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq .Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + b.gt .Loop_cbc_enc + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + add x7,x3,#16 + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + + mov w6,w5 + eor v6.16b,v0.16b,v7.16b + st1 {v6.16b},[x1],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + ld1 {v2.4s-v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc128 + + st1 
{v6.16b},[x1],#16 + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + ld1 {v4.4s-v5.4s},[x7] + eor v6.16b,v6.16b,v7.16b + eor v2.16b,v0.16b,v7.16b + mov x12,x8 + +.Loop2x_cbc_dec128: + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + subs x2,x2,#32 + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + csel x8,xzr,x8,lo + aesd v0.16b,v4.16b + aesd v1.16b,v4.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + csel x12,xzr,x12,ls + aesd v0.16b,v5.16b + aesd v1.16b,v5.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v18.16b + aesd v1.16b,v18.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v19.16b + aesd v1.16b,v19.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v20.16b + aesd v1.16b,v20.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v21.16b + aesd v1.16b,v21.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v22.16b + aesd v1.16b,v22.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + + eor v6.16b,v6.16b,v0.16b + ld1 {v0.16b},[x0],x8 + eor v2.16b,v2.16b,v1.16b + ld1 {v1.16b},[x0],x12 + st1 {v6.16b},[x1],#16 + eor v6.16b,v3.16b,v7.16b + st1 {v2.16b},[x1],#16 + eor v2.16b,v0.16b,v7.16b + orr v3.16b,v1.16b,v1.16b + b.hs .Loop2x_cbc_dec128 + + adds x2,x2,#32 + eor v6.16b,v6.16b,v7.16b + b.eq .Lcbc_done + eor v2.16b,v2.16b,v7.16b + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs x2,x2,#16 + orr v2.16b,v0.16b,v0.16b + b.lo .Lcbc_dec_tail + + csel x8,xzr,x8,eq + cmp w5,#2 + ld1 {v1.16b},[x0],x8 + orr v3.16b,v1.16b,v1.16b + b.eq .Lcbc_dec128 + +.Loop2x_cbc_dec: + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + b.gt .Loop2x_cbc_dec + + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + aesimc 
v0.16b,v0.16b + aesimc v1.16b,v1.16b + eor v4.16b,v6.16b,v7.16b + eor v5.16b,v2.16b,v7.16b + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + orr v6.16b,v3.16b,v3.16b + subs x2,x2,#32 + aesd v0.16b,v18.16b + aesd v1.16b,v18.16b + aesimc v0.16b,v0.16b + csel x8,xzr,x8,lo + aesimc v1.16b,v1.16b + mov x7,x3 + aesd v0.16b,v19.16b + aesd v1.16b,v19.16b + aesimc v0.16b,v0.16b + ld1 {v2.16b},[x0],x8 + aesimc v1.16b,v1.16b + csel x8,xzr,x8,ls + aesd v0.16b,v20.16b + aesd v1.16b,v20.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v3.16b},[x0],x8 + aesd v0.16b,v21.16b + aesd v1.16b,v21.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + aesd v0.16b,v22.16b + aesd v1.16b,v22.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + + mov w6,w5 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + orr v0.16b,v2.16b,v2.16b + st1 {v4.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v5.16b},[x1],#16 + b.hs .Loop2x_cbc_dec + + adds x2,x2,#32 + b.eq .Lcbc_done + +.Lcbc_dec_tail: + aesd v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesimc v0.16b,v0.16b + subs w6,w6,#2 + aesd v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesimc v0.16b,v0.16b + b.gt .Lcbc_dec_tail + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + eor v4.16b,v6.16b,v7.16b + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + orr v6.16b,v2.16b,v2.16b + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v23.16b + + eor v4.16b,v4.16b,v0.16b + st1 {v4.16b},[x1],#16 + +.Lcbc_done: + st1 {v6.16b},[x4] +.Lcbc_abort: + ldr x29,[sp],#16 + ret +.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks +.type aes_v8_ctr32_encrypt_blocks,%function +.align 5 
+aes_v8_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s-v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s-v19.4s},[x7],#32 + ld1 {v20.4s-v21.4s},[x7],#32 + ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + + subs x2,x2,#2 + b.lo .Lctr32_tail + +#ifndef __ARMEB__ + rev w8, w8 +#endif + orr v1.16b,v0.16b,v0.16b + add w8, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w8 + cmp w5,#2 + mov v1.s[3],w10 + b.eq .Lctr32_128 + +.Loop2x_ctr32: + aese v0.16b,v16.16b + aese v1.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + aese v1.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + b.gt .Loop2x_ctr32 + + aese v0.16b,v16.16b + aese v1.16b,v16.16b + aesmc v4.16b,v0.16b + orr v0.16b,v6.16b,v6.16b + aesmc v5.16b,v1.16b + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aese v5.16b,v17.16b + ld1 {v2.16b},[x0],#16 + aesmc v4.16b,v4.16b + ld1 {v3.16b},[x0],#16 + aesmc v5.16b,v5.16b + add w8,w8,#1 + aese v4.16b,v18.16b + aese v5.16b,v18.16b + rev w9,w8 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + add w8,w8,#1 + aese v4.16b,v19.16b + aese v5.16b,v19.16b + eor v2.16b,v2.16b,v7.16b + rev w10,w8 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + eor v3.16b,v3.16b,v7.16b + mov x7,x3 + aese v4.16b,v20.16b + aese v5.16b,v20.16b + subs x2,x2,#2 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1] + aese v4.16b,v21.16b + aese v5.16b,v21.16b + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + aese v4.16b,v22.16b + aese v5.16b,v22.16b + mov v0.s[3], w9 + aesmc v4.16b,v4.16b + mov v1.s[3], w10 + aesmc v5.16b,v5.16b + aese v4.16b,v23.16b + aese v5.16b,v23.16b + + mov w6,w5 + eor v2.16b,v2.16b,v4.16b + eor v3.16b,v3.16b,v5.16b + st1 {v2.16b},[x1],#16 + st1 
{v3.16b},[x1],#16 + b.hs .Loop2x_ctr32 + + adds x2,x2,#2 + b.eq .Lctr32_done + b .Lctr32_tail + +.Lctr32_128: + ld1 {v4.4s-v5.4s},[x7] + +.Loop2x_ctr32_128: + aese v0.16b,v16.16b + aese v1.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v2.16b},[x0],#16 + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0],#16 + aese v0.16b,v17.16b + aese v1.16b,v17.16b + add w8,w8,#1 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + rev w9,w8 + aese v0.16b,v4.16b + aese v1.16b,v4.16b + add w8,w8,#1 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + rev w10,w8 + aese v0.16b,v5.16b + aese v1.16b,v5.16b + subs x2,x2,#2 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v18.16b + aese v1.16b,v18.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v19.16b + aese v1.16b,v19.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v20.16b + aese v1.16b,v20.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v21.16b + aese v1.16b,v21.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v22.16b + aese v1.16b,v22.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v23.16b + eor v3.16b,v3.16b,v7.16b + aese v1.16b,v23.16b + + eor v2.16b,v2.16b,v0.16b + orr v0.16b,v6.16b,v6.16b + eor v3.16b,v3.16b,v1.16b + orr v1.16b,v6.16b,v6.16b + st1 {v2.16b},[x1],#16 + mov v0.s[3], w9 + st1 {v3.16b},[x1],#16 + mov v1.s[3], w10 + b.hs .Loop2x_ctr32_128 + + adds x2,x2,#2 + b.eq .Lctr32_done + +.Lctr32_tail: + aese v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v2.16b},[x0] + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v23.16b + + eor 
v2.16b,v2.16b,v0.16b + st1 {v2.16b},[x1] + +.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.S b/app/openssl/crypto/aes/asm/aesv8-armx.S new file mode 100644 index 00000000..1637e4d4 --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx.S @@ -0,0 +1,767 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.fpu neon +.code 32 +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + adr r3,rcon + cmp r1,#192 + + veor q0,q0,q0 + vld1.8 {q3},[r0]! + mov r1,#8 @ reuse r1 + vld1.32 {q1,q2},[r3]! + + blt .Loop128 + beq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + bne .Loop128 + + vld1.32 {q1},[r3] + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + veor q3,q3,q10 + vst1.32 {q3},[r2] + add r2,r2,#0x50 + + mov r12,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {d16},[r0]! + vmov.i8 q10,#8 @ borrow q10 + vst1.32 {q3},[r2]! 
+ vsub.i8 q2,q2,q10 @ adjust the mask + +.Loop192: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {d16},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + + vdup.32 q9,d7[1] + veor q9,q9,q8 + veor q10,q10,q1 + vext.8 q8,q0,q8,#12 + vshl.u8 q1,q1,#1 + veor q8,q8,q9 + veor q3,q3,q10 + veor q8,q8,q10 + vst1.32 {q3},[r2]! + bne .Loop192 + + mov r12,#12 + add r2,r2,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 {q8},[r0] + mov r1,#7 + mov r12,#14 + vst1.32 {q3},[r2]! + +.Loop256: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q8},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + vst1.32 {q3},[r2]! + beq .Ldone + + vdup.32 q10,d7[1] + vext.8 q9,q0,q8,#12 + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + + veor q8,q8,q10 + b .Loop256 + +.Ldone: + str r12,[r2] + + eor r0,r0,r0 @ return value + + bx lr +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + stmdb sp!,{r4,lr} + bl .Lenc_key + + sub r2,r2,#240 @ restore original r2 + mov r4,#-16 + add r0,r2,r12,lsl#4 @ end of key schedule + + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + +.Loop_imc: + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! 
+ cmp r0,r2 + bhi .Loop_imc + + vld1.32 {q0},[r2] + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vst1.32 {q0},[r0] + + eor r0,r0,r0 @ return value + ldmia sp!,{r4,pc} +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_enc: + .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 + vld1.32 {q0},[r2]! + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + subs r3,r3,#2 + .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + vld1.32 {q1},[r2]! + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + bgt .Loop_enc + + .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 + vld1.32 {q0},[r2] + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_v8_encrypt,.-aes_v8_encrypt +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_dec: + .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 + vld1.32 {q0},[r2]! + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + subs r3,r3,#2 + .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + vld1.32 {q1},[r2]! + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + bgt .Loop_dec + + .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 + vld1.32 {q0},[r2] + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_cbc_encrypt +.type aes_v8_cbc_encrypt,%function +.align 5 +aes_v8_cbc_encrypt: + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo .Lcbc_abort + moveq r8,#0 + + cmp r5,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q6},[r4] + vld1.8 {q0},[r0],r8 + + vld1.32 {q8-q9},[r3] @ load key schedule... 
+ sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10-q11},[r7]! + vld1.32 {q12-q13},[r7]! + vld1.32 {q14-q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq .Lcbc_dec + + cmp r5,#2 + veor q0,q0,q6 + veor q5,q8,q7 + beq .Lcbc_enc128 + +.Loop_cbc_enc: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + vld1.32 {q9},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + bgt .Loop_cbc_enc + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r7,r3,#16 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + + mov r6,r5 + veor q6,q0,q7 + vst1.8 {q6},[r1]! + bhs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {q2-q3},[r7] + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! 
+.Lenter_cbc_enc128: + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 + .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 + .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs .Loop_cbc_enc128 + + vst1.8 {q6},[r1]! + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + vld1.32 {q4-q5},[r7] + veor q6,q6,q7 + veor q2,q0,q7 + mov r12,r8 + +.Loop2x_cbc_dec128: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + subs r2,r2,#32 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movlo r8,#0 + .byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4 + .byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movls r12,#0 + .byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5 + .byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ 
aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 + + veor q6,q6,q0 + vld1.8 {q0},[r0],r8 + veor q2,q2,q1 + vld1.8 {q1},[r0],r12 + vst1.8 {q6},[r1]! + veor q6,q3,q7 + vst1.8 {q2},[r1]! + veor q2,q0,q7 + vorr q3,q1,q1 + bhs .Loop2x_cbc_dec128 + + adds r2,r2,#32 + veor q6,q6,q7 + beq .Lcbc_done + veor q2,q2,q7 + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs r2,r2,#16 + vorr q2,q0,q0 + blo .Lcbc_dec_tail + + moveq r8,#0 + cmp r5,#2 + vld1.8 {q1},[r0],r8 + vorr q3,q1,q1 + beq .Lcbc_dec128 + +.Loop2x_cbc_dec: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + vld1.32 {q8},[r7]! + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + subs r6,r6,#2 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + vld1.32 {q9},[r7]! 
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + bgt .Loop2x_cbc_dec + + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + veor q4,q6,q7 + veor q5,q2,q7 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vorr q6,q3,q3 + subs r2,r2,#32 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + movlo r8,#0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + mov r7,r3 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vld1.8 {q2},[r0],r8 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movls r8,#0 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.8 {q3},[r0],r8 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 + + mov r6,r5 + veor q4,q4,q0 + veor q5,q5,q1 + vorr q0,q2,q2 + vst1.8 {q4},[r1]! + vorr q1,q3,q3 + vst1.8 {q5},[r1]! + bhs .Loop2x_cbc_dec + + adds r2,r2,#32 + beq .Lcbc_done + +.Lcbc_dec_tail: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + vld1.32 {q8},[r7]! + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + subs r6,r6,#2 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + vld1.32 {q9},[r7]! 
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + bgt .Lcbc_dec_tail + + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + veor q4,q6,q7 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vorr q6,q2,q2 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + + veor q4,q4,q0 + vst1.8 {q4},[r1]! + +.Lcbc_done: + vst1.8 {q6},[r4] +.Lcbc_abort: + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r8,pc} +.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks +.type aes_v8_ctr32_encrypt_blocks,%function +.align 5 +aes_v8_ctr32_encrypt_blocks: + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg + ldr r5,[r3,#240] + + ldr r8, [r4, #12] + vld1.32 {q0},[r4] + + vld1.32 {q8-q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10-q11},[r7]! + vld1.32 {q12-q13},[r7]! + vld1.32 {q14-q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + + subs r2,r2,#2 + blo .Lctr32_tail + +#ifndef __ARMEB__ + rev r8, r8 +#endif + vorr q1,q0,q0 + add r8, r8, #1 + vorr q6,q0,q0 + rev r10, r8 + cmp r5,#2 + vmov.32 d3[1],r10 + beq .Lctr32_128 + +.Loop2x_ctr32: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 + vld1.32 {q9},[r7]! 
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + bgt .Loop2x_ctr32 + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 + vorr q0,q6,q6 + .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + vorr q1,q6,q6 + .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 + .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 + vld1.8 {q2},[r0]! + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + vld1.8 {q3},[r0]! + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + add r8,r8,#1 + .byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10 + .byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10 + rev r9,r8 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + add r8,r8,#1 + .byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11 + .byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11 + veor q2,q2,q7 + rev r10,r8 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q3,q3,q7 + mov r7,r3 + .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 + .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 + subs r2,r2,#2 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1] + .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 + .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 + .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 + vmov.32 d1[1], r9 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + vmov.32 d3[1], r10 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 + .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 + + mov r6,r5 + veor q2,q2,q4 + veor q3,q3,q5 + vst1.8 {q2},[r1]! + vst1.8 {q3},[r1]! 
+ bhs .Loop2x_ctr32 + + adds r2,r2,#2 + beq .Lctr32_done + b .Lctr32_tail + +.Lctr32_128: + vld1.32 {q4-q5},[r7] + +.Loop2x_ctr32_128: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q2},[r0]! + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0]! + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 + add r8,r8,#1 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + rev r9,r8 + .byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4 + .byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4 + add r8,r8,#1 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + rev r10,r8 + .byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5 + .byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5 + subs r2,r2,#2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q2,q2,q7 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q3,q3,q7 + .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + + veor q2,q2,q0 + vorr q0,q6,q6 + veor q3,q3,q1 + vorr q1,q6,q6 + vst1.8 {q2},[r1]! + vmov.32 d1[1], r9 + vst1.8 {q3},[r1]! 
+ vmov.32 d3[1], r10 + bhs .Loop2x_ctr32_128 + + adds r2,r2,#2 + beq .Lctr32_done + +.Lctr32_tail: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + vld1.32 {q9},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + bgt .Lctr32_tail + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q2},[r0] + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q2,q2,q7 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + + veor q2,q2,q0 + vst1.8 {q2},[r1] + +.Lctr32_done: + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.pl b/app/openssl/crypto/aes/asm/aesv8-armx.pl new file mode 100644 index 00000000..415dc04a --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx.pl @@ -0,0 +1,980 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. 
It also supports both 32- and 64-bit modes +# of operation. The latter is achieved by limiting the number of utilized +# registers to 16, which implies additional instructions. This has +# no effect on mighty Apple A7, as results are literally equal to +# the theoretical estimates based on instruction latencies and issue +# rate. It remains to be seen how it affects other platforms... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A5x n/a n/a n/a + +$flavour = shift; +open STDOUT,">".shift; + +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex voodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +$code.=<<___; +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___; + adr $ptr,rcon + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {$in1},[$inp],#8 + vmov.i8 $key,#8 // borrow $key + vst1.32 {$in0},[$out],#16 + vsub.i8 $mask,$mask,$key // adjust the mask + +.Loop192: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#8 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + + vdup.32 $tmp,${in0}[3] + veor $tmp,$tmp,$in1 + veor $key,$key,$rcon + vext.8 $in1,$zero,$in1,#12 + vshl.u8 $rcon,$rcon,#1 + veor $in1,$in1,$tmp + veor $in0,$in0,$key + veor $in1,$in1,$key + vst1.32 {$in0},[$out],#16 + b.ne .Loop192 + + mov $rounds,#12 + add $out,$out,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 
{$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + + eor x0,x0,x0 // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key + +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + stmdb sp!,{r4,lr} +___ +$code.=<<___; + bl .Lenc_key + + sub $out,$out,#240 // restore original $out + mov x4,#-16 + add $inp,$out,x12,lsl#4 // end of key schedule + + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + +.Loop_imc: + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + cmp $inp,$out + b.hi .Loop_imc + + vld1.32 {v0.16b},[$out] + aesimc v0.16b,v0.16b + vst1.32 {v0.16b},[$inp] + + eor x0,x0,x0 // return value +___ +$code.=<<___ if ($flavour !~ /64/); + ldmia sp!,{r4,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldp x29,x30,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} +{{{ +sub gen_block () { +my $dir = shift; +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); +my ($inp,$out,$key)=map("x$_",(0..2)); +my $rounds="w3"; +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ldr $rounds,[$key,#240] + vld1.32 {$rndkey0},[$key],#16 + vld1.8 {$inout},[$inp] + sub $rounds,$rounds,#2 + vld1.32 {$rndkey1},[$key],#16 + +.Loop_${dir}c: + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key],#16 + aes$mc $inout,$inout + subs $rounds,$rounds,#2 + aes$e $inout,$rndkey1 + vld1.32 {$rndkey1},[$key],#16 + aes$mc $inout,$inout + b.gt .Loop_${dir}c + + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key] + aes$mc $inout,$inout + aes$e $inout,$rndkey1 + veor $inout,$inout,$rndkey0 + + vst1.8 {$inout},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my 
($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args +___ +$code.=<<___; + subs $len,$len,#16 + mov $step,#16 + b.lo .Lcbc_abort + cclr $step,eq + + cmp $enc,#0 // en- or decrypting? + ldr $rounds,[$key,#240] + and $len,$len,#-16 + vld1.8 {$ivec},[$ivp] + vld1.8 {$dat},[$inp],$step + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#6 + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys + sub $rounds,$rounds,#2 + vld1.32 {q10-q11},[$key_],#32 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + + add $key_,$key,#32 + mov $cnt,$rounds + b.eq .Lcbc_dec + + cmp $rounds,#2 + veor $dat,$dat,$ivec + veor $rndzero_n_last,q8,$rndlast + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese $dat,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat,$dat + subs $cnt,$cnt,#2 + aese $dat,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat,$dat + b.gt .Loop_cbc_enc + + aese $dat,q8 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,q9 + aesmc $dat,$dat + cclr $step,eq + aese $dat,q10 + aesmc $dat,$dat + add $key_,$key,#16 + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q13 + aesmc $dat,$dat + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aese $dat,q14 + aesmc $dat,$dat + aese $dat,q15 + + mov $cnt,$rounds + veor $ivec,$dat,$rndlast + vst1.8 {$ivec},[$out],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {$in0-$in1},[$key_] + aese $dat,q8 + aesmc 
$dat,$dat + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese $dat,q8 + aesmc $dat,$dat + vst1.8 {$ivec},[$out],#16 +.Lenter_cbc_enc128: + aese $dat,q9 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,$in0 + aesmc $dat,$dat + cclr $step,eq + aese $dat,$in1 + aesmc $dat,$dat + aese $dat,q10 + aesmc $dat,$dat + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + aese $dat,q13 + aesmc $dat,$dat + aese $dat,q14 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q15 + veor $ivec,$dat,$rndlast + b.hs .Loop_cbc_enc128 + + vst1.8 {$ivec},[$out],#16 + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + vld1.32 {$tmp0-$tmp1},[$key_] + veor $ivec,$ivec,$rndlast + veor $in0,$dat0,$rndlast + mov $step1,$step + +.Loop2x_cbc_dec128: + aesd $dat0,q8 + aesd $dat1,q8 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + subs $len,$len,#32 + aesd $dat0,q9 + aesd $dat1,q9 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + cclr $step,lo + aesd $dat0,$tmp0 + aesd $dat1,$tmp0 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + cclr $step1,ls + aesd $dat0,$tmp1 + aesd $dat1,$tmp1 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q10 + aesd $dat1,q10 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q11 + aesd $dat1,q11 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q12 + aesd $dat1,q12 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q13 + aesd $dat1,q13 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q14 + aesd $dat1,q14 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q15 + aesd $dat1,q15 + + veor $ivec,$ivec,$dat0 + vld1.8 {$dat0},[$inp],$step + veor $in0,$in0,$dat1 + vld1.8 {$dat1},[$inp],$step1 + vst1.8 {$ivec},[$out],#16 + veor $ivec,$in1,$rndlast + vst1.8 {$in0},[$out],#16 + veor $in0,$dat0,$rndlast + vorr $in1,$dat1,$dat1 + b.hs .Loop2x_cbc_dec128 + + adds $len,$len,#32 + veor $ivec,$ivec,$rndlast + b.eq .Lcbc_done + veor $in0,$in0,$rndlast + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs $len,$len,#16 + vorr $in0,$dat,$dat + 
b.lo .Lcbc_dec_tail + + cclr $step,eq + cmp $rounds,#2 + vld1.8 {$dat1},[$inp],$step + vorr $in1,$dat1,$dat1 + b.eq .Lcbc_dec128 + +.Loop2x_cbc_dec: + aesd $dat0,q8 + aesd $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + subs $cnt,$cnt,#2 + aesd $dat0,q9 + aesd $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + b.gt .Loop2x_cbc_dec + + aesd $dat0,q8 + aesd $dat1,q8 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + veor $tmp0,$ivec,$rndlast + veor $tmp1,$in0,$rndlast + aesd $dat0,q9 + aesd $dat1,q9 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vorr $ivec,$in1,$in1 + subs $len,$len,#32 + aesd $dat0,q10 + aesd $dat1,q10 + aesimc $dat0,$dat0 + cclr $step,lo + aesimc $dat1,$dat1 + mov $key_,$key + aesd $dat0,q11 + aesd $dat1,q11 + aesimc $dat0,$dat0 + vld1.8 {$in0},[$inp],$step + aesimc $dat1,$dat1 + cclr $step,ls + aesd $dat0,q12 + aesd $dat1,q12 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.8 {$in1},[$inp],$step + aesd $dat0,q13 + aesd $dat1,q13 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesd $dat0,q14 + aesd $dat1,q14 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aesd $dat0,q15 + aesd $dat1,q15 + + mov $cnt,$rounds + veor $tmp0,$tmp0,$dat0 + veor $tmp1,$tmp1,$dat1 + vorr $dat0,$in0,$in0 + vst1.8 {$tmp0},[$out],#16 + vorr $dat1,$in1,$in1 + vst1.8 {$tmp1},[$out],#16 + b.hs .Loop2x_cbc_dec + + adds $len,$len,#32 + b.eq .Lcbc_done + +.Lcbc_dec_tail: + aesd $dat,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat,$dat + subs $cnt,$cnt,#2 + aesd $dat,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat,$dat + b.gt .Lcbc_dec_tail + + aesd $dat,q8 + aesimc $dat,$dat + aesd $dat,q9 + aesimc $dat,$dat + veor $tmp,$ivec,$rndlast + aesd $dat,q10 + aesimc $dat,$dat + vorr $ivec,$in0,$in0 + aesd $dat,q11 + aesimc $dat,$dat + aesd $dat,q12 + aesimc $dat,$dat + aesd $dat,q13 + aesimc $dat,$dat + aesd $dat,q14 + aesimc $dat,$dat + aesd $dat,q15 
+
+ veor $tmp,$tmp,$dat
+ vst1.8 {$tmp},[$out],#16
+
+.Lcbc_done:
+ vst1.8 {$ivec},[$ivp]
+.Lcbc_abort:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+# CTR mode: appends ${prefix}_ctr32_encrypt_blocks to $code.
+# The assembly is written with 64-bit register names; the 32-bit pass at
+# the end of this script rewrites wN/xN to rN.
+#   x0..x4      : $inp, $out, $len, $key, $ivp
+#   w5, w6      : $rounds, $cnt (the heredoc also names x5 literally,
+#                 i.e. the 64-bit view of $rounds)
+#   x7          : $key_, post-incremented pointer into the key schedule
+#   w8, w9, w10 : $ctr, $tctr, $tctr1 (counter and byte-reversed copies)
+#   q0..q7      : $dat0, $dat1, $in0, $in1, $tmp0, $tmp1, $ivec, $rndlast
+# $len is decremented by 2 for every pair of 16-byte blocks processed,
+# so it counts blocks rather than bytes.
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+# $dat is the alias used by the one-block .Lctr32_tail path; $tmp is not
+# referenced anywhere in this section.
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+# Symbol declaration and entry label.
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+# Prologue, 64-bit flavour: push x29/x30 and set up the frame pointer.
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+# Prologue, 32-bit flavour: save r4-r10,lr and d8-d15, then fetch the
+# argument that lives on the caller's stack into r4 (which $ivp maps to).
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+# Main body. Reads the round count from offset 240 of the key and the
+# 32-bit counter word from offset 12 of the ivec, then preloads q8-q15
+# and $rndlast with round keys. Three paths follow:
+#   .Loop2x_ctr32     - two blocks per iteration, general round count
+#   .Loop2x_ctr32_128 - two blocks per iteration, taken when the adjusted
+#                       $rounds (loaded value minus 8) equals 2
+#   .Lctr32_tail      - one block, entered when fewer than two remain
+# The counter is byte-reversed unless __ARMEB__ is defined, incremented
+# as an integer, reversed again and inserted into lane 3 of the block.
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+
+ subs $len,$len,#2
+ b.lo .Lctr32_tail
+
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $ctr, $ctr, #1
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $ctr
+ cmp $rounds,#2
+ vmov.32 ${dat1}[3],$tctr1
+ b.eq .Lctr32_128
+
+.Loop2x_ctr32:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Loop2x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $tmp0,$dat0
+ vorr $dat0,$ivec,$ivec
+ aesmc $tmp1,$dat1
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aese $tmp1,q9
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in1},[$inp],#16
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q10
+ aese $tmp1,q10
+ rev $tctr,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q11
+ aese $tmp1,q11
+ veor $in0,$in0,$rndlast
+ rev $tctr1,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ veor $in1,$in1,$rndlast
+ mov $key_,$key
+ aese $tmp0,q12
+ aese $tmp1,q12
+ subs $len,$len,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q14
+ aese $tmp1,q14
+ vmov.32 ${dat0}[3], $tctr
+ aesmc $tmp0,$tmp0
+ vmov.32 ${dat1}[3], $tctr1
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q15
+ aese $tmp1,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ b.hs .Loop2x_ctr32
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+
+.Loop2x_ctr32_128:
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat0,q9
+ aese $dat1,q9
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr,$ctr
+ aese $dat0,$tmp0
+ aese $dat1,$tmp0
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr1,$ctr
+ aese $dat0,$tmp1
+ aese $dat1,$tmp1
+ subs $len,$len,#2
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q10
+ aese $dat1,q10
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q11
+ aese $dat1,q11
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q12
+ aese $dat1,q12
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q15
+ veor $in1,$in1,$rndlast
+ aese $dat1,q15
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3], $tctr
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3], $tctr1
+ b.hs .Loop2x_ctr32_128
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+
+.Lctr32_tail:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Lctr32_tail
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.8 {$in0},[$inp]
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor $in0,$in0,$rndlast
+ aese $dat,q15
+
+ veor $in0,$in0,$dat
+ vst1.8 {$in0},[$out]
+
+.Lctr32_done:
+___
+# Epilogue, 32-bit flavour: restore d8-d15 and r4-r10, returning via pc.
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+# Epilogue, 64-bit flavour: pop the 16-byte frame record (only x29 is
+# reloaded) and return.
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+# Record the size of the function just emitted.
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+# Emit an #endif into the generated assembly; presumably it closes a
+# conditional opened earlier in $code (not visible in this section).
+$code.=<<___;
+#endif
+___
+########################################
+# Post-processing. $code mixes old-style (32-bit) and new-style (64-bit)
+# notation; every line is rewritten here into the syntax of the selected
+# flavour and printed to STDOUT. The order of the substitutions within
+# each loop matters: the "or" chains stop at the first rule that fires.
+if ($flavour =~ /64/) { ######## 64-bit code
+    # Base encodings of the AES instructions for this flavour.
+    my %opcode = (
+        "aesd" => 0x4e285800, "aese" => 0x4e284800,
+        "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
+
+    # unaes(mnemonic, operands): hand-encode an AES instruction as a
+    # ".inst" word. The first operand's register number is OR-ed into
+    # the low bits and the second operand's is shifted left by 5.
+    # Returns a false value when the operands do not match.
+    # Not called in this branch at present: the substitution that would
+    # invoke it is commented out in the loop below.
+    local *unaes = sub {
+        my ($mnemonic,$arg)=@_;
+
+        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
+        sprintf ".inst\t0x%08x\t//%s %s",
+            $opcode{$mnemonic}|$1|($2<<5),
+            $mnemonic,$arg;
+    };
+
+    foreach(split("\n",$code)) {
+        # Expand `...` spans by evaluating them as Perl.
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+        # qN becomes vN.16b for N<8 and v(N+8).16b otherwise.
+        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+        s/@\s/\/\//o; # old->new style commentary
+
+        #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+        s/vmov\.i8/movi/o or # fix up legacy mnemonics
+        s/vext\.8/ext/o or
+        s/vrev32\.8/rev32/o or
+        s/vtst\.8/cmtst/o or
+        s/vshr/ushr/o or
+        s/^(\s+)v/$1/o or # strip off v prefix
+        s/\bbx\s+lr\b/ret/o;
+
+        # fix up remaining legacy suffixes
+        s/\.[ui]?8//o;
+        m/\],#8/o and s/\.16b/\.8b/go;
+        s/\.[ui]?32//o and s/\.16b/\.4s/go;
+        s/\.[ui]?64//o and s/\.16b/\.2d/go;
+        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+        print $_,"\n";
+    }
+} else { ######## 32-bit code
+    # Base encodings of the AES instructions for this flavour.
+    my %opcode = (
+        "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
+        "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
+
+    # unaes(mnemonic, operands): hand-encode an AES instruction. The two
+    # register numbers are split into a low 3-bit part and bit 3, each
+    # placed at its own position in the word, and the word is emitted as
+    # four ".byte" values, least significant first. Returns a false
+    # value when the operands do not match.
+    local *unaes = sub {
+        my ($mnemonic,$arg)=@_;
+
+        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                         |(($2&7)<<1) |(($2&8)<<2);
+            # since ARMv7 instructions are always encoded little-endian.
+            # correct solution is to use .inst directive, but older
+            # assemblers don't implement it:-(
+            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+                $word&0xff,($word>>8)&0xff,
+                ($word>>16)&0xff,($word>>24)&0xff,
+                $mnemonic,$arg;
+        }
+    };
+
+    # unvtbl(operands): split "qD,{qT},qM" into two vtbl.8 instructions,
+    # one per d-register half (d(2D)/d(2M), then d(2D+1)/d(2M+1)).
+    sub unvtbl {
+        my $arg=shift;
+
+        $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+        sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+            "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+    }
+
+    # unvdup32(operands): rewrite the source lane "qM[i]" as a lane of the
+    # matching d-register, d(2M + i>>1)[i&1].
+    sub unvdup32 {
+        my $arg=shift;
+
+        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+
+    # unvmov32(operands): rewrite the destination lane "qD[i]" as
+    # d(2D + i>>1)[i&1]; the rest of the operands pass through as-is.
+    sub unvmov32 {
+        my $arg=shift;
+
+        $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+        sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+    }
+
+    foreach(split("\n",$code)) {
+        # Expand `...` spans by evaluating them as Perl.
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+        s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+        s/\/\/\s?/@ /o; # new->old style commentary
+
+        # fix up remaining new-style suffixes
+        s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
+        s/\],#[0-9]+/]!/o;
+
+        s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+        s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+        s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+        s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+        s/^(\s+)b\./$1b/o or
+        s/^(\s+)ret/$1bx\tlr/o;
+
+        print $_,"\n";
+    }
+}
+
+# Buffered write errors only surface when the handle is closed. A failed
+# close means the generated assembly may be truncated, so abort with a
+# non-zero status instead of silently leaving a partial file behind.
+close STDOUT or die "error closing STDOUT: $!";
-- 
cgit v1.2.3