#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
.fpu	neon
.code	32
.align	5
@ Constant table for the key schedule, read as three 128-bit vectors by
@ aes_v8_set_encrypt_key (the first two rows with one load, the third later).
rcon:
.long	0x01,0x01,0x01,0x01	@ starting round constant, same value in each 32-bit lane
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b	@ round constant reloaded after the eight-pass 128-bit loop

@ ---------------------------------------------------------------------
@ aes_v8_set_encrypt_key: expand a user key into an AES key schedule.
@ In:    r0 = user key bytes (16, 24 or 32 of them are read)
@        r1 = key length in bits; must be 128, 192 or 256
@        r2 = output key schedule; the round count (10, 12 or 14) is
@             stored as a word at offset 240
@ Out:   r0 = 0 on success,
@             -1 if r0 or r2 is a null pointer,
@             -2 if the bit length is not 128, 192 or 256
@ Clobb: r1, r3, r12, q0-q3, q8-q10, flags. On success r2 is left at
@        schedule+240 and r12 holds the round count; the decrypt setup
@        below depends on both. On failure nothing is read or written.
@ ---------------------------------------------------------------------
.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	mov	r3,#-1			@ error code for a null argument
	cmp	r0,#0
	beq	.Lenc_key_abort
	cmp	r2,#0
	beq	.Lenc_key_abort
	mov	r3,#-2			@ error code for a bad key length
	cmp	r1,#128
	blt	.Lenc_key_abort
	cmp	r1,#256
	bgt	.Lenc_key_abort
	tst	r1,#0x3f		@ within 128..256 only 128/192/256 are multiples of 64
	bne	.Lenc_key_abort

	adr	r3,rcon
	cmp	r1,#192

	veor	q0,q0,q0		@ q0 = 0, used as zero key and as shift-in for vext
	vld1.8	{q3},[r0]!
	mov	r1,#8		@ reuse r1
	vld1.32	{q1,q2},[r3]!	@ q1 = round constant, q2 = rotate-n-splat mask

	blt	.Loop128
	beq	.L192
	b	.L256

.align	4
@ 128-bit key: eight passes with the doubling round constant, then two more
@ passes using the constant reloaded from the third rcon row.
.Loop128:
	vtbl.8	d20,{q3},d4
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q3},[r2]!
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	 veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1	@ double the round constant for the next pass
	veor	q3,q3,q10
	bne	.Loop128

	vld1.32	{q1},[r3]	@ switch to the 0x1b round constant

	vtbl.8	d20,{q3},d4
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q3},[r2]!
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	 veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1
	veor	q3,q3,q10

	vtbl.8	d20,{q3},d4
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q3},[r2]!
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	 veor	q10,q10,q1
	veor	q3,q3,q9
	veor	q3,q3,q10
	vst1.32	{q3},[r2]
	add	r2,r2,#0x50	@ r2 = schedule+240 (11 round keys written)

	mov	r12,#10
	b	.Ldone

.align	4
@ 192-bit key: each of the eight passes emits 24 bytes of schedule.
.L192:
	vld1.8	{d16},[r0]!
	vmov.i8	q10,#8			@ borrow q10
	vst1.32	{q3},[r2]!
	vsub.i8	q2,q2,q10	@ adjust the mask

.Loop192:
	vtbl.8	d20,{q8},d4
	vtbl.8	d21,{q8},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{d16},[r2]!
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9

	vdup.32	q9,d7[1]
	veor	q9,q9,q8
	 veor	q10,q10,q1
	vext.8	q8,q0,q8,#12
	vshl.u8	q1,q1,#1
	veor	q8,q8,q9
	veor	q3,q3,q10
	veor	q8,q8,q10
	vst1.32	{q3},[r2]!
	bne	.Loop192

	mov	r12,#12
	add	r2,r2,#0x20	@ r2 = schedule+240 (16 + 8*24 bytes written so far)
	b	.Ldone

.align	4
@ 256-bit key: seven passes; the last pass leaves through the beq below
@ after storing the final round key.
.L256:
	vld1.8	{q8},[r0]
	mov	r1,#7
	mov	r12,#14
	vst1.32	{q3},[r2]!

.Loop256:
	vtbl.8	d20,{q8},d4
	vtbl.8	d21,{q8},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q8},[r2]!
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	 veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1
	veor	q3,q3,q10
	vst1.32	{q3},[r2]!
	beq	.Ldone		@ flags still come from the subs above

	vdup.32	q10,d7[1]
	vext.8	q9,q0,q8,#12
	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q8,q8,q9
	vext.8	q9,q0,q9,#12
	veor	q8,q8,q9
	vext.8	q9,q0,q9,#12
	veor	q8,q8,q9

	veor	q8,q8,q10
	b	.Loop256

.Ldone:
	str	r12,[r2]	@ round count at schedule+240
	mov	r3,#0		@ success

.Lenc_key_abort:
	mov	r0,r3		@ return value
	
	bx	lr
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

@ ---------------------------------------------------------------------
@ aes_v8_set_decrypt_key: build the schedule used by the decrypt routines.
@ Same arguments and return codes as aes_v8_set_encrypt_key. The encrypt
@ schedule is generated first, then the round keys are reversed in place
@ and every key except the two outermost ones is passed through aesimc.
@ If key setup fails, the error code is returned and nothing is modified.
@ Saves and restores r4.
@ ---------------------------------------------------------------------
.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	stmdb	sp!,{r4,lr}
	bl	.Lenc_key

	cmp	r0,#0		@ on error r2 and r12 are not valid, so bail out
	bne	.Ldec_key_abort

	sub	r2,r2,#240		@ restore original r2
	mov	r4,#-16
	add	r0,r2,r12,lsl#4	@ end of key schedule

	vld1.32	{q0},[r2]	@ swap first and last round keys unchanged
	vld1.32	{q1},[r0]
	vst1.32	{q0},[r0],r4
	vst1.32	{q1},[r2]!

.Loop_imc:
	vld1.32	{q0},[r2]	@ swap the next outer pair, transforming both
	vld1.32	{q1},[r0]
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	vst1.32	{q0},[r0],r4
	vst1.32	{q1},[r2]!
	cmp	r0,r2
	bhi	.Loop_imc

	vld1.32	{q0},[r2]	@ middle round key is transformed in place
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	vst1.32	{q0},[r0]

	eor	r0,r0,r0		@ return value
.Ldec_key_abort:
	ldmia	sp!,{r4,pc}
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@ ---------------------------------------------------------------------
@ aes_v8_encrypt: encrypt a single 16-byte block.
@ In:    r0 = input block, r1 = output block,
@        r2 = key schedule (round count read from offset 240)
@ Clobb: r2, r3, q0-q2, flags
@ ---------------------------------------------------------------------
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	ldr	r3,[r2,#240]		@ must precede the post-incrementing loads of r2
	vld1.8	{q2},[r0]		@ q2 = state
	vld1.32	{q0},[r2]!		@ q0/q1 hold the next two round keys
	vld1.32	{q1},[r2]!
	sub	r3,r3,#2		@ the final two rounds are handled after the loop

.Lenc_round_pair:
	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q0},[r2]!
	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q1},[r2]!
	subs	r3,r3,#2
	bgt	.Lenc_round_pair

	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q0},[r2]		@ last round key
	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
	veor	q2,q2,q0		@ final key addition, no aesmc in the last round

	vst1.8	{q2},[r1]
	bx	lr
.size	aes_v8_encrypt,.-aes_v8_encrypt
@ ---------------------------------------------------------------------
@ aes_v8_decrypt: decrypt a single 16-byte block.
@ In:    r0 = input block, r1 = output block,
@        r2 = key schedule (round count read from offset 240)
@ Clobb: r2, r3, q0-q2, flags
@ ---------------------------------------------------------------------
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	ldr	r3,[r2,#240]		@ must precede the post-incrementing loads of r2
	vld1.8	{q2},[r0]		@ q2 = state
	vld1.32	{q0},[r2]!		@ q0/q1 hold the next two round keys
	vld1.32	{q1},[r2]!
	sub	r3,r3,#2		@ the final two rounds are handled after the loop

.Ldec_round_pair:
	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q0},[r2]!
	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q1},[r2]!
	subs	r3,r3,#2
	bgt	.Ldec_round_pair

	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q0},[r2]		@ last round key
	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
	veor	q2,q2,q0		@ final key addition, no aesimc in the last round

	vst1.8	{q2},[r1]
	bx	lr
.size	aes_v8_decrypt,.-aes_v8_decrypt
@ ---------------------------------------------------------------------
@ aes_v8_cbc_encrypt: CBC-mode processing of whole 16-byte blocks.
@ In:    r0 = input, r1 = output, r2 = length in bytes,
@        r3 = key schedule (round count read from offset 240),
@        first stack word  = pointer to the 16-byte IV,
@        second stack word = direction: zero takes the decrypt path,
@                            anything else the encrypt path.
@ Out:   the IV buffer is overwritten with the chaining value left after
@        the last processed block.
@ A length below 16 returns at once without touching output or IV;
@ otherwise the length is rounded down to a multiple of 16.
@ Register roles after setup:
@   q8,q9   = round keys 0 and 1 (reloaded as the variable rounds run)
@   q10-q15 = the six round keys before the last one, q7 = last round key
@   q6      = chaining value, r8 = input stride (16, or 0 at the last block
@             so the read-ahead load never moves past the input)
@   r5      = rounds-8, r6 = working copy of r5, r7 = round-key cursor
@ Saves and restores r4-r8 and d8-d15.
@ ---------------------------------------------------------------------
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
aes_v8_cbc_encrypt:
	mov	ip,sp			@ remember where the stack arguments are
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	r2,r2,#16
	mov	r8,#16
	blo	.Lcbc_abort
	moveq	r8,#0			@ single block: do not advance the input pointer

	cmp	r5,#0			@ en- or decrypting?
	ldr	r5,[r3,#240]
	and	r2,r2,#-16
	vld1.8	{q6},[r4]
	vld1.8	{q0},[r0],r8

	vld1.32	{q8-q9},[r3]		@ load key schedule...
	sub	r5,r5,#6
	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
	sub	r5,r5,#2
	vld1.32	{q10-q11},[r7]!
	vld1.32	{q12-q13},[r7]!
	vld1.32	{q14-q15},[r7]!
	vld1.32	{q7},[r7]

	add	r7,r3,#32		@ cursor at round key 2
	mov	r6,r5
	beq	.Lcbc_dec		@ still the flags of the cmp r5,#0 above

	cmp	r5,#2			@ r5 is 2 when the round count is 10
	veor	q0,q0,q6		@ first input block xor IV
	veor	q5,q8,q7		@ round key 0 xor last round key
	beq	.Lcbc_enc128

@ Encrypt, round counts other than 10: the loop runs the variable leading
@ rounds, the straight-line part the fixed trailing ones.
.Loop_cbc_enc:
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	vld1.32	{q8},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	subs	r6,r6,#2
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	vld1.32	{q9},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	bgt	.Loop_cbc_enc

	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 subs	r2,r2,#16		@ these flags feed the moveq and the bhs below
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 moveq	r8,#0
	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 add	r7,r3,#16
	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vld1.8	{q8},[r0],r8		@ read ahead the next input block
	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 veor	q8,q8,q5		@ next input xor round key 0 xor last round key
	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15

	 mov	r6,r5
	veor	q6,q0,q7		@ q6 = output block, also the next chaining value
	vst1.8	{q6},[r1]!
	bhs	.Loop_cbc_enc

	b	.Lcbc_done

@ Encrypt, 10 rounds: all round keys stay in registers (q2,q3 = keys 2 and 3).
@ The store of each output block is deferred into the following iteration;
@ the last one is stored after the loop.
.align	5
.Lcbc_enc128:
	vld1.32	{q2-q3},[r7]
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vst1.8	{q6},[r1]!
.Lenter_cbc_enc128:
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 subs	r2,r2,#16		@ these flags feed the moveq and the bhs below
	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 moveq	r8,#0
	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vld1.8	{q8},[r0],r8
	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 veor	q8,q8,q5
	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	veor	q6,q0,q7
	bhs	.Loop_cbc_enc128

	vst1.8	{q6},[r1]!
	b	.Lcbc_done

@ Decrypt, 10 rounds, two blocks per iteration. q4,q5 = round keys 2 and 3.
@ q6 and q2 carry the two chaining values already xored with the last round
@ key, so a single veor finishes each block. r8 and r12 are the strides of
@ the two read-ahead loads and are zeroed as the input runs out.
.align	5
.Lcbc_dec128:
	vld1.32	{q4-q5},[r7]
	veor	q6,q6,q7
	veor	q2,q0,q7
	mov	r12,r8

.Loop2x_cbc_dec128:
	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 subs	r2,r2,#32		@ flags used by movlo, movls and the bhs below
	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 movlo	r8,#0
	.byte	0x48,0x03,0xb0,0xf3	@ aesd q0,q4
	.byte	0x48,0x23,0xb0,0xf3	@ aesd q1,q4
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 movls	r12,#0
	.byte	0x4a,0x03,0xb0,0xf3	@ aesd q0,q5
	.byte	0x4a,0x23,0xb0,0xf3	@ aesd q1,q5
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15

	veor	q6,q6,q0
	vld1.8	{q0},[r0],r8
	veor	q2,q2,q1
	vld1.8	{q1},[r0],r12
	vst1.8	{q6},[r1]!
	veor	q6,q3,q7
	vst1.8	{q2},[r1]!
	veor	q2,q0,q7
	vorr	q3,q1,q1
	bhs	.Loop2x_cbc_dec128

	adds	r2,r2,#32
	veor	q6,q6,q7		@ strip the last round key again: plain chaining value
	beq	.Lcbc_done
	veor	q2,q2,q7		@ likewise for q2 before the one-block tail
	b	.Lcbc_dec_tail

@ Decrypt entry. q2 and q3 keep unmodified copies of the input blocks in
@ flight, since each becomes the chaining value of the block after it.
.align	5
.Lcbc_dec:
	subs	r2,r2,#16
	vorr	q2,q0,q0
	blo	.Lcbc_dec_tail		@ only one block in total

	moveq	r8,#0
	cmp	r5,#2			@ r5 is 2 when the round count is 10
	vld1.8	{q1},[r0],r8
	vorr	q3,q1,q1
	beq	.Lcbc_dec128

@ Decrypt, round counts other than 10, two blocks per iteration.
.Loop2x_cbc_dec:
	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
	vld1.32	{q8},[r7]!
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	subs	r6,r6,#2
	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
	vld1.32	{q9},[r7]!
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	bgt	.Loop2x_cbc_dec

	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 veor	q4,q6,q7		@ q4,q5 = chaining values xor last round key
	 veor	q5,q2,q7
	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 vorr	q6,q3,q3		@ second input block is the next chaining value
	 subs	r2,r2,#32		@ flags used by movlo, movls and the bhs below
	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	 movlo	r8,#0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 mov	r7,r3
	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	 vld1.8	{q2},[r0],r8
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 movls	r8,#0
	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 vld1.8	{q3},[r0],r8
	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0]
	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15

	 mov	r6,r5
	veor	q4,q4,q0
	veor	q5,q5,q1
	 vorr	q0,q2,q2
	vst1.8	{q4},[r1]!
	 vorr	q1,q3,q3
	vst1.8	{q5},[r1]!
	bhs	.Loop2x_cbc_dec

	adds	r2,r2,#32
	beq	.Lcbc_done

@ Decrypt one remaining block held in q0; q2 is its unmodified copy and
@ q6 the chaining value that precedes it.
.Lcbc_dec_tail:
	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
	vld1.32	{q8},[r7]!
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	subs	r6,r6,#2
	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
	vld1.32	{q9},[r7]!
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	bgt	.Lcbc_dec_tail

	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	 veor	q4,q6,q7
	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	 vorr	q6,q2,q2		@ this input block is the chaining value to hand back
	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15

	veor	q4,q4,q0
	vst1.8	{q4},[r1]!

.Lcbc_done:
	vst1.8	{q6},[r4]		@ write the chaining value back to the IV buffer
.Lcbc_abort:
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@ ---------------------------------------------------------------------
@ aes_v8_ctr32_encrypt_blocks: CTR-mode keystream xor over whole blocks.
@ In:    r0 = input, r1 = output, r2 = number of 16-byte blocks,
@        r3 = key schedule (round count read from offset 240),
@        first stack word = pointer to the 16-byte counter block.
@ The last 32-bit word of the counter block is handled as a big-endian
@ counter and incremented by one per block; the counter block in memory
@ is only read, never written back.
@ A block count of zero returns immediately without touching memory.
@ Register roles after setup:
@   q8,q9   = round keys 0 and 1 (reloaded as the variable rounds run)
@   q10-q15 = the six round keys before the last one, q7 = last round key
@   q6      = copy of the initial counter block, used as a template
@   r8      = counter value, r9/r10 = byte-swapped counters for lane insert
@   r5      = rounds-8, r6 = working copy of r5, r7 = round-key cursor
@ Saves and restores r4-r10 and d8-d15.
@ ---------------------------------------------------------------------
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
aes_v8_ctr32_encrypt_blocks:
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	cmp		r2,#0			@ zero blocks: the subs/blo below would
	beq		.Lctr32_done		@ otherwise fall into the one-block tail
	ldr		r4, [ip]		@ load remaining arg
	ldr		r5,[r3,#240]

	ldr		r8, [r4, #12]
	vld1.32		{q0},[r4]

	vld1.32		{q8-q9},[r3]		@ load key schedule...
	sub		r5,r5,#6
	add		r7,r3,r5,lsl#4	@ pointer to last 7 round keys
	sub		r5,r5,#2
	vld1.32		{q10-q11},[r7]!
	vld1.32		{q12-q13},[r7]!
	vld1.32		{q14-q15},[r7]!
	vld1.32		{q7},[r7]

	add		r7,r3,#32		@ cursor at round key 2
	mov		r6,r5

	subs		r2,r2,#2
	blo		.Lctr32_tail		@ exactly one block

#ifndef __ARMEB__
	rev		r8, r8
#endif
	vorr		q1,q0,q0
	add		r8, r8, #1
	vorr		q6,q0,q0
	rev		r10, r8
	cmp		r5,#2			@ r5 is 2 when the round count is 10
	vmov.32	d3[1],r10		@ q1 = counter block + 1
	beq		.Lctr32_128

@ Round counts other than 10, two blocks per iteration.
.Loop2x_ctr32:
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
	vld1.32		{q8},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	subs		r6,r6,#2
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
	vld1.32		{q9},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	bgt		.Loop2x_ctr32

	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
	 vorr		q0,q6,q6
	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
	 vorr		q1,q6,q6
	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
	 vld1.8		{q2},[r0]!
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	 vld1.8		{q3},[r0]!
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	 add		r8,r8,#1
	.byte	0x24,0x83,0xb0,0xf3	@ aese q4,q10
	.byte	0x24,0xa3,0xb0,0xf3	@ aese q5,q10
	 rev		r9,r8
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	 add		r8,r8,#1
	.byte	0x26,0x83,0xb0,0xf3	@ aese q4,q11
	.byte	0x26,0xa3,0xb0,0xf3	@ aese q5,q11
	 veor		q2,q2,q7
	 rev		r10,r8
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	 veor		q3,q3,q7
	 mov		r7,r3
	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
	 subs		r2,r2,#2		@ flags consumed by the bhs at the loop end
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	 vld1.32	 {q8-q9},[r7]!	@ re-pre-load rndkey[0-1]
	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
	 vmov.32	d1[1], r9
	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	 vmov.32	d3[1], r10
	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15

	 mov		r6,r5
	veor		q2,q2,q4
	veor		q3,q3,q5
	vst1.8		{q2},[r1]!
	vst1.8		{q3},[r1]!
	bhs		.Loop2x_ctr32

	adds		r2,r2,#2
	beq		.Lctr32_done
	b		.Lctr32_tail

@ 10 rounds: all round keys stay in registers (q4,q5 = keys 2 and 3).
.Lctr32_128:
	vld1.32		{q4-q5},[r7]

.Loop2x_ctr32_128:
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vld1.8		{q2},[r0]!
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	 vld1.8		{q3},[r0]!
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
	 add		r8,r8,#1
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	 rev		r9,r8
	.byte	0x08,0x03,0xb0,0xf3	@ aese q0,q4
	.byte	0x08,0x23,0xb0,0xf3	@ aese q1,q4
	 add		r8,r8,#1
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	 rev		r10,r8
	.byte	0x0a,0x03,0xb0,0xf3	@ aese q0,q5
	.byte	0x0a,0x23,0xb0,0xf3	@ aese q1,q5
	 subs		r2,r2,#2		@ flags consumed by the bhs at the loop end
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
	.byte	0x24,0x23,0xb0,0xf3	@ aese q1,q10
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
	.byte	0x26,0x23,0xb0,0xf3	@ aese q1,q11
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	 veor		q2,q2,q7
	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	 veor		q3,q3,q7
	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15

	veor		q2,q2,q0
	vorr		q0,q6,q6
	veor		q3,q3,q1
	vorr		q1,q6,q6
	vst1.8		{q2},[r1]!
	vmov.32	d1[1], r9
	vst1.8		{q3},[r1]!
	vmov.32	d3[1], r10
	bhs		.Loop2x_ctr32_128

	adds		r2,r2,#2
	beq		.Lctr32_done

@ One remaining block; q0 already holds its counter block.
.Lctr32_tail:
	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	vld1.32		{q8},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	subs		r6,r6,#2
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	vld1.32		{q9},[r7]!
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	bgt		.Lctr32_tail

	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 vld1.8		{q2},[r0]
	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	 veor		q2,q2,q7
	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15

	veor		q2,q2,q0
	vst1.8		{q2},[r1]

.Lctr32_done:
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif