Diffstat (limited to 'app/openssl/crypto/aes/asm/aesv8-armx-64.S')
-rw-r--r-- | app/openssl/crypto/aes/asm/aesv8-armx-64.S | 761
1 file changed, 761 insertions, 0 deletions
diff --git a/app/openssl/crypto/aes/asm/aesv8-armx-64.S b/app/openssl/crypto/aes/asm/aesv8-armx-64.S
new file mode 100644
index 00000000..be0a13df
--- /dev/null
+++ b/app/openssl/crypto/aes/asm/aesv8-armx-64.S
@@ -0,0 +1,761 @@
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.align	5
+rcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.globl	aes_v8_set_encrypt_key
+.type	aes_v8_set_encrypt_key,%function
+.align	5
+aes_v8_set_encrypt_key:
+.Lenc_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	adr	x3,rcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	.Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8		// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+.Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	.Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+.Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	.Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	.Loop256
+
+.Ldone:
+	str	w12,[x2]
+
+	eor	x0,x0,x0		// return value
+	ldr	x29,[sp],#16
+	ret
+.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
+
+.globl	aes_v8_set_decrypt_key
+.type	aes_v8_set_decrypt_key,%function
+.align	5
+aes_v8_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	.Lenc_key
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+.Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	.Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+	ldp	x29,x30,[sp],#16
+	ret
+.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
+.globl	aes_v8_encrypt
+.type	aes_v8_encrypt,%function
+.align	5
+aes_v8_encrypt:
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_enc:
+	aese	v2.16b,v0.16b
+	ld1	{v0.4s},[x2],#16
+	aesmc	v2.16b,v2.16b
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	ld1	{v1.4s},[x2],#16
+	aesmc	v2.16b,v2.16b
+	b.gt	.Loop_enc
+
+	aese	v2.16b,v0.16b
+	ld1	{v0.4s},[x2]
+	aesmc	v2.16b,v2.16b
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_v8_encrypt,.-aes_v8_encrypt
+.globl	aes_v8_decrypt
+.type	aes_v8_decrypt,%function
+.align	5
+aes_v8_decrypt:
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_dec:
+	aesd	v2.16b,v0.16b
+	ld1	{v0.4s},[x2],#16
+	aesimc	v2.16b,v2.16b
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	ld1	{v1.4s},[x2],#16
+	aesimc	v2.16b,v2.16b
+	b.gt	.Loop_dec
+
+	aesd	v2.16b,v0.16b
+	ld1	{v0.4s},[x2]
+	aesimc	v2.16b,v2.16b
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_v8_decrypt,.-aes_v8_decrypt
+.globl	aes_v8_cbc_encrypt
+.type	aes_v8_cbc_encrypt,%function
+.align	5
+aes_v8_cbc_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	.Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s-v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s-v19.4s},[x7],#32
+	ld1	{v20.4s-v21.4s},[x7],#32
+	ld1	{v22.4s-v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	.Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	.Lcbc_enc128
+
+.Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	ld1	{v16.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	ld1	{v17.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	b.gt	.Loop_cbc_enc
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	add	x7,x3,#16
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+
+	mov	w6,w5
+	eor	v6.16b,v0.16b,v7.16b
+	st1	{v6.16b},[x1],#16
+	b.hs	.Loop_cbc_enc
+
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	ld1	{v2.4s-v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_dec128:
+	ld1	{v4.4s-v5.4s},[x7]
+	eor	v6.16b,v6.16b,v7.16b
+	eor	v2.16b,v0.16b,v7.16b
+	mov	x12,x8
+
+.Loop2x_cbc_dec128:
+	aesd	v0.16b,v16.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	subs	x2,x2,#32
+	aesd	v0.16b,v17.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	csel	x8,xzr,x8,lo
+	aesd	v0.16b,v4.16b
+	aesd	v1.16b,v4.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	csel	x12,xzr,x12,ls
+	aesd	v0.16b,v5.16b
+	aesd	v1.16b,v5.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v18.16b
+	aesd	v1.16b,v18.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v19.16b
+	aesd	v1.16b,v19.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v20.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v21.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v22.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+
+	eor	v6.16b,v6.16b,v0.16b
+	ld1	{v0.16b},[x0],x8
+	eor	v2.16b,v2.16b,v1.16b
+	ld1	{v1.16b},[x0],x12
+	st1	{v6.16b},[x1],#16
+	eor	v6.16b,v3.16b,v7.16b
+	st1	{v2.16b},[x1],#16
+	eor	v2.16b,v0.16b,v7.16b
+	orr	v3.16b,v1.16b,v1.16b
+	b.hs	.Loop2x_cbc_dec128
+
+	adds	x2,x2,#32
+	eor	v6.16b,v6.16b,v7.16b
+	b.eq	.Lcbc_done
+	eor	v2.16b,v2.16b,v7.16b
+	b	.Lcbc_dec_tail
+
+.align	5
+.Lcbc_dec:
+	subs	x2,x2,#16
+	orr	v2.16b,v0.16b,v0.16b
+	b.lo	.Lcbc_dec_tail
+
+	csel	x8,xzr,x8,eq
+	cmp	w5,#2
+	ld1	{v1.16b},[x0],x8
+	orr	v3.16b,v1.16b,v1.16b
+	b.eq	.Lcbc_dec128
+
+.Loop2x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesd	v1.16b,v16.16b
+	ld1	{v16.4s},[x7],#16
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesd	v1.16b,v17.16b
+	ld1	{v17.4s},[x7],#16
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	b.gt	.Loop2x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	eor	v4.16b,v6.16b,v7.16b
+	eor	v5.16b,v2.16b,v7.16b
+	aesd	v0.16b,v17.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	orr	v6.16b,v3.16b,v3.16b
+	subs	x2,x2,#32
+	aesd	v0.16b,v18.16b
+	aesd	v1.16b,v18.16b
+	aesimc	v0.16b,v0.16b
+	csel	x8,xzr,x8,lo
+	aesimc	v1.16b,v1.16b
+	mov	x7,x3
+	aesd	v0.16b,v19.16b
+	aesd	v1.16b,v19.16b
+	aesimc	v0.16b,v0.16b
+	ld1	{v2.16b},[x0],x8
+	aesimc	v1.16b,v1.16b
+	csel	x8,xzr,x8,ls
+	aesd	v0.16b,v20.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0],x8
+	aesd	v0.16b,v21.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	aesd	v0.16b,v22.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+
+	mov	w6,w5
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v4.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v5.16b},[x1],#16
+	b.hs	.Loop2x_cbc_dec
+
+	adds	x2,x2,#32
+	b.eq	.Lcbc_done
+
+.Lcbc_dec_tail:
+	aesd	v0.16b,v16.16b
+	ld1	{v16.4s},[x7],#16
+	aesimc	v0.16b,v0.16b
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	ld1	{v17.4s},[x7],#16
+	aesimc	v0.16b,v0.16b
+	b.gt	.Lcbc_dec_tail
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	eor	v4.16b,v6.16b,v7.16b
+	aesd	v0.16b,v18.16b
+	aesimc	v0.16b,v0.16b
+	orr	v6.16b,v2.16b,v2.16b
+	aesd	v0.16b,v19.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v0.16b,v23.16b
+
+	eor	v4.16b,v4.16b,v0.16b
+	st1	{v4.16b},[x1],#16
+
+.Lcbc_done:
+	st1	{v6.16b},[x4]
+.Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl	aes_v8_ctr32_encrypt_blocks
+.type	aes_v8_ctr32_encrypt_blocks,%function
+.align	5
+aes_v8_ctr32_encrypt_blocks:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s-v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s-v19.4s},[x7],#32
+	ld1	{v20.4s-v21.4s},[x7],#32
+	ld1	{v22.4s-v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+
+	subs	x2,x2,#2
+	b.lo	.Lctr32_tail
+
+#ifndef __ARMEB__
+	rev	w8, w8
+#endif
+	orr	v1.16b,v0.16b,v0.16b
+	add	w8, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w8
+	cmp	w5,#2
+	mov	v1.s[3],w10
+	b.eq	.Lctr32_128
+
+.Loop2x_ctr32:
+	aese	v0.16b,v16.16b
+	aese	v1.16b,v16.16b
+	ld1	{v16.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aese	v1.16b,v17.16b
+	ld1	{v17.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	b.gt	.Loop2x_ctr32
+
+	aese	v0.16b,v16.16b
+	aese	v1.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	orr	v0.16b,v6.16b,v6.16b
+	aesmc	v5.16b,v1.16b
+	orr	v1.16b,v6.16b,v6.16b
+	aese	v4.16b,v17.16b
+	aese	v5.16b,v17.16b
+	ld1	{v2.16b},[x0],#16
+	aesmc	v4.16b,v4.16b
+	ld1	{v3.16b},[x0],#16
+	aesmc	v5.16b,v5.16b
+	add	w8,w8,#1
+	aese	v4.16b,v18.16b
+	aese	v5.16b,v18.16b
+	rev	w9,w8
+	aesmc	v4.16b,v4.16b
+	aesmc	v5.16b,v5.16b
+	add	w8,w8,#1
+	aese	v4.16b,v19.16b
+	aese	v5.16b,v19.16b
+	eor	v2.16b,v2.16b,v7.16b
+	rev	w10,w8
+	aesmc	v4.16b,v4.16b
+	aesmc	v5.16b,v5.16b
+	eor	v3.16b,v3.16b,v7.16b
+	mov	x7,x3
+	aese	v4.16b,v20.16b
+	aese	v5.16b,v20.16b
+	subs	x2,x2,#2
+	aesmc	v4.16b,v4.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v16.4s-v17.4s},[x7],#32	// re-pre-load rndkey[0-1]
+	aese	v4.16b,v21.16b
+	aese	v5.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aesmc	v5.16b,v5.16b
+	aese	v4.16b,v22.16b
+	aese	v5.16b,v22.16b
+	mov	v0.s[3], w9
+	aesmc	v4.16b,v4.16b
+	mov	v1.s[3], w10
+	aesmc	v5.16b,v5.16b
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+
+	mov	w6,w5
+	eor	v2.16b,v2.16b,v4.16b
+	eor	v3.16b,v3.16b,v5.16b
+	st1	{v2.16b},[x1],#16
+	st1	{v3.16b},[x1],#16
+	b.hs	.Loop2x_ctr32
+
+	adds	x2,x2,#2
+	b.eq	.Lctr32_done
+	b	.Lctr32_tail
+
+.Lctr32_128:
+	ld1	{v4.4s-v5.4s},[x7]
+
+.Loop2x_ctr32_128:
+	aese	v0.16b,v16.16b
+	aese	v1.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v2.16b},[x0],#16
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0],#16
+	aese	v0.16b,v17.16b
+	aese	v1.16b,v17.16b
+	add	w8,w8,#1
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	rev	w9,w8
+	aese	v0.16b,v4.16b
+	aese	v1.16b,v4.16b
+	add	w8,w8,#1
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	rev	w10,w8
+	aese	v0.16b,v5.16b
+	aese	v1.16b,v5.16b
+	subs	x2,x2,#2
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v18.16b
+	aese	v1.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v19.16b
+	aese	v1.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v20.16b
+	aese	v1.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v21.16b
+	aese	v1.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v22.16b
+	aese	v1.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v23.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v1.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v0.16b
+	orr	v0.16b,v6.16b,v6.16b
+	eor	v3.16b,v3.16b,v1.16b
+	orr	v1.16b,v6.16b,v6.16b
+	st1	{v2.16b},[x1],#16
+	mov	v0.s[3], w9
+	st1	{v3.16b},[x1],#16
+	mov	v1.s[3], w10
+	b.hs	.Loop2x_ctr32_128
+
+	adds	x2,x2,#2
+	b.eq	.Lctr32_done
+
+.Lctr32_tail:
+	aese	v0.16b,v16.16b
+	ld1	{v16.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	ld1	{v17.4s},[x7],#16
+	aesmc	v0.16b,v0.16b
+	b.gt	.Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v2.16b},[x0]
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v0.16b
+	st1	{v2.16b},[x1]
+
+.Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+#endif
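For reference, below is a minimal C sketch of how these new entry points are typically declared and called. The prototypes are inferred from the register usage in the assembly (x0 = input, x1 = output, x2 = length/blocks or key output, x3 = key schedule, x4 = IV/counter) and from OpenSSL's usual aes_v8_* declarations; the AES_KEY type comes from OpenSSL's headers, and the helper ctr128_encrypt_blocks is a hypothetical example, not part of this diff. Callers are also expected to check at runtime that the CPU supports the ARMv8 Crypto Extensions (in OpenSSL this is typically a test of OPENSSL_armcap_P against ARMV8_AES from arm_arch.h); the #if __ARM_ARCH__>=7 guard above only gates compilation.

    #include <stddef.h>
    #include <openssl/aes.h>   /* AES_KEY */

    /* Assumed C-side declarations for the assembly routines added above. */
    int  aes_v8_set_encrypt_key(const unsigned char *user_key, int bits, AES_KEY *key);
    int  aes_v8_set_decrypt_key(const unsigned char *user_key, int bits, AES_KEY *key);
    void aes_v8_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
    void aes_v8_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
    void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length,
                            const AES_KEY *key, unsigned char *ivec, int enc);
    void aes_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                     size_t blocks, const AES_KEY *key,
                                     const unsigned char ivec[16]);

    /*
     * Hypothetical usage: encrypt a whole number of 16-byte blocks with
     * AES-128 in CTR mode.  The counter in ivec[12..15] is big-endian and
     * is advanced by the assembly for each block it processes.
     */
    static int ctr128_encrypt_blocks(const unsigned char key_bytes[16],
                                     const unsigned char iv[16],
                                     const unsigned char *in, unsigned char *out,
                                     size_t blocks)
    {
        AES_KEY ks;
        if (aes_v8_set_encrypt_key(key_bytes, 128, &ks) != 0)
            return -1;              /* the assembly returns 0 on success */
        aes_v8_ctr32_encrypt_blocks(in, out, blocks, &ks, iv);
        return 0;
    }

Note that aes_v8_ctr32_encrypt_blocks takes a block count rather than a byte count, and aes_v8_cbc_encrypt requires the length to be a multiple of 16 bytes; both match how the assembly decrements x2 in its main loops.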