diff options
author | Parménides GV <parmegv@sdf.org> | 2014-10-02 18:19:30 +0200 |
---|---|---|
committer | Parménides GV <parmegv@sdf.org> | 2014-10-02 18:19:30 +0200 |
commit | 34643c6b5ab0643383e24025876b0d69859ba4f9 (patch) | |
tree | cb15666fb01b0f0410327ae7aaa23df444ac3b4c /app/openssl/crypto | |
parent | 22b7ee4614a2f47d55496de8a9b55040c0f4ba85 (diff) | |
parent | 914c5156b014970dde717b9a27c0c69f11cc7d98 (diff) |
Merge branch 'feature/Update-ndk-version-and-native-binaries-#6142' into develop
Diffstat (limited to 'app/openssl/crypto')
80 files changed, 17253 insertions, 7448 deletions
diff --git a/app/openssl/crypto/aes/asm/aes-armv4.S b/app/openssl/crypto/aes/asm/aes-armv4.S index 88959108..333a5227 120000..100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.S +++ b/app/openssl/crypto/aes/asm/aes-armv4.S @@ -1 +1,1177 @@ -aes-armv4.s
\ No newline at end of file + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. 
+ +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type AES_Te,%object +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d 
+.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 
0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 +.size AES_Te,.-AES_Te + +@ void AES_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_encrypt +.type AES_encrypt,%function +.align 5 +AES_encrypt: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub 
r10,r3,#AES_encrypt-AES_Te @ Te +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_encrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_encrypt,.-AES_encrypt + +.type _armv4_AES_encrypt,%function +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0 + and r8,lr,r0,lsr#8 + and r9,lr,r0,lsr#16 + mov r0,r0,lsr#24 +.Lenc_loop: + ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] + and r8,lr,r1 + ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] + ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] + ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] + eor r0,r0,r7,ror#8 + ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,ror#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r6,r9,ror#8 + and r9,lr,r2 + ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] + eor r1,r1,r4,ror#24 + ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,ror#8 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r6,r9,ror#16 + and r9,lr,r3,lsr#16 @ i2 + ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] + eor r2,r2,r5,ror#16 + ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] + eor r0,r0,r7,ror#24 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] + eor r2,r2,r9,ror#8 + ldr r4,[r11,#-12] + eor r3,r3,r6,ror#8 + + ldr r5,[r11,#-8] + eor r0,r0,r7 + ldr r6,[r11,#-4] + and r7,lr,r0 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0,lsr#16 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Lenc_loop + + add r10,r10,#2 + + ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] + and r8,lr,r1 + ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] + and r9,lr,r1,lsr#8 + ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] + mov r1,r1,lsr#24 + + ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] + ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] + ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] + eor 
r0,r7,r0,lsl#8 + ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,lsl#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] + eor r1,r4,r1,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] + mov r2,r2,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] + eor r0,r7,r0,lsl#8 + ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r3,lsr#16 @ i2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] + eor r2,r5,r2,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] + mov r3,r3,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] + eor r0,r7,r0,lsl#8 + ldr r7,[r11,#0] + ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r2,r9,lsl#16 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#2 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_encrypt,.-_armv4_AES_encrypt + +.global private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,%function +.align 5 +private_AES_set_encrypt_key: +_armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif + teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq .Labrt + teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq .Labrt + + teq r1,#128 + beq .Lok + teq r1,#192 + beq .Lok + teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif + movne r0,#-1 + bne .Labrt + +.Lok: stmdb sp!,{r4-r12,lr} + sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 + + mov r12,r0 @ inp + mov lr,r1 @ bits + mov r11,r2 @ key + +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... 
+ ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + str r0,[r11],#16 + orr r3,r3,r5,lsl#16 + str r1,[r11,#-12] + orr r3,r3,r6,lsl#24 + str r2,[r11,#-8] + str r3,[r11,#-4] +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r11],#16 + str r1,[r11,#-12] + str r2,[r11,#-8] + str r3,[r11,#-4] +#endif + + teq lr,#128 + bne .Lnot128 + mov r12,#10 + str r12,[r11,#240-16] + add r6,r10,#256 @ rcon + mov lr,#255 + +.L128_loop: + and r5,lr,r3,lsr#24 + and r7,lr,r3,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r3 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r5,r5,r4 + eor r0,r0,r5 @ rk[4]=rk[0]^... 
+ eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] + str r0,[r11],#16 + eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] + str r1,[r11,#-12] + eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] + str r2,[r11,#-8] + subs r12,r12,#1 + str r3,[r11,#-4] + bne .L128_loop + sub r2,r11,#176 + b .Ldone + +.Lnot128: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#19] + ldrb r4,[r12,#18] + ldrb r5,[r12,#17] + ldrb r6,[r12,#16] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#23] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#22] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#21] + ldrb r6,[r12,#20] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#16] + ldr r9,[r12,#20] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + teq lr,#192 + bne .Lnot192 + mov r12,#12 + str r12,[r11,#240-24] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#8 + +.L192_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[6]=rk[0]^... 
+ eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] + str r0,[r11],#24 + eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] + str r1,[r11,#-20] + eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] + str r2,[r11,#-16] + subs r12,r12,#1 + str r3,[r11,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#216 + beq .Ldone + + ldr r7,[r11,#-32] + ldr r8,[r11,#-28] + eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] + eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] + str r7,[r11,#-8] + str r9,[r11,#-4] + b .L192_loop + +.Lnot192: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#27] + ldrb r4,[r12,#26] + ldrb r5,[r12,#25] + ldrb r6,[r12,#24] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#31] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#30] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#29] + ldrb r6,[r12,#28] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#24] + ldr r9,[r12,#28] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + mov r12,#14 + str r12,[r11,#240-32] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#7 + +.L256_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[8]=rk[0]^... 
+ eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] + str r0,[r11],#32 + eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] + str r1,[r11,#-28] + eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] + str r2,[r11,#-24] + subs r12,r12,#1 + str r3,[r11,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#256 + beq .Ldone + + and r5,lr,r3 + and r7,lr,r3,lsr#8 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#16 + ldrb r7,[r10,r7] + and r9,lr,r3,lsr#24 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#8 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r11,#-48] + orr r5,r5,r9,lsl#24 + + ldr r7,[r11,#-44] + ldr r8,[r11,#-40] + eor r4,r4,r5 @ rk[12]=rk[4]^... + ldr r9,[r11,#-36] + eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] + str r4,[r11,#-16] + eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] + str r7,[r11,#-12] + eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] + str r8,[r11,#-8] + str r9,[r11,#-4] + b .L256_loop + +.align 2 +.Ldone: mov r0,#0 + ldmia sp!,{r4-r12,lr} +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key + +.global private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,%function +.align 5 +private_AES_set_decrypt_key: + str lr,[sp,#-4]! 
@ push lr + bl _armv4_AES_set_encrypt_key + teq r0,#0 + ldr lr,[sp],#4 @ pop lr + bne .Labrt + + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr r12,[r0,#240] + mov r7,r0 @ input + add r8,r0,r12,lsl#4 + mov r11,r1 @ ouput + add r10,r1,r12,lsl#4 + str r12,[r1,#240] + +.Linv: ldr r0,[r7],#16 + ldr r1,[r7,#-12] + ldr r2,[r7,#-8] + ldr r3,[r7,#-4] + ldr r4,[r8],#-16 + ldr r5,[r8,#16+4] + ldr r6,[r8,#16+8] + ldr r9,[r8,#16+12] + str r0,[r10],#-16 + str r1,[r10,#16+4] + str r2,[r10,#16+8] + str r3,[r10,#16+12] + str r4,[r11],#16 + str r5,[r11,#-12] + str r6,[r11,#-8] + str r9,[r11,#-4] + teq r7,r8 + bne .Linv + + ldr r0,[r7] + ldr r1,[r7,#4] + ldr r2,[r7,#8] + ldr r3,[r7,#12] + str r0,[r11] + str r1,[r11,#4] + str r2,[r11,#8] + str r3,[r11,#12] + sub r11,r11,r12,lsl#3 + ldr r0,[r11,#16]! 
@ prefetch tp1 + mov r7,#0x80 + mov r8,#0x1b + orr r7,r7,#0x8000 + orr r8,r8,#0x1b00 + orr r7,r7,r7,lsl#16 + orr r8,r8,r8,lsl#16 + sub r12,r12,#1 + mvn r9,r7 + mov r12,r12,lsl#2 @ (rounds-1)*4 + +.Lmix: and r4,r0,r7 + and r1,r0,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r1,r4,r1,lsl#1 @ tp2 + + and r4,r1,r7 + and r2,r1,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r2,r4,r2,lsl#1 @ tp4 + + and r4,r2,r7 + and r3,r2,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r3,r4,r3,lsl#1 @ tp8 + + eor r4,r1,r2 + eor r5,r0,r3 @ tp9 + eor r4,r4,r3 @ tpe + eor r4,r4,r1,ror#24 + eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor r4,r4,r2,ror#16 + eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) + + ldr r0,[r11,#4] @ prefetch tp1 + str r4,[r11],#4 + subs r12,r12,#1 + bne .Lmix + + mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key + +.type AES_Td,%object +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 
0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 
0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.size AES_Td,.-AES_Td + +@ void AES_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_decrypt +.type AES_decrypt,%function +.align 5 +AES_decrypt: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub r10,r3,#AES_decrypt-AES_Td @ Td +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_decrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... 
+ mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_decrypt,.-AES_decrypt + +.type _armv4_AES_decrypt,%function +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! @ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0,lsr#16 + and r8,lr,r0,lsr#8 + and r9,lr,r0 + mov r0,r0,lsr#24 +.Ldec_loop: + ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] + and r7,lr,r1 @ i0 + ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] + and r8,lr,r1,lsr#16 + ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] + ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] + ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] + eor r0,r0,r7,ror#24 + ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,ror#8 + and r8,lr,r2 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r2,lsr#16 + ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] + eor r1,r1,r4,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] + and r7,lr,r3,lsr#16 @ i0 + eor r1,r1,r8,ror#24 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r3 @ i2 + ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] + eor r2,r2,r5,ror#8 + ldr r8,[r10,r8,lsl#2] @ 
Td2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] + eor r0,r0,r7,ror#8 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] + eor r2,r2,r9,ror#24 + + ldr r4,[r11,#-12] + eor r0,r0,r7 + ldr r5,[r11,#-8] + eor r3,r3,r6,ror#8 + ldr r6,[r11,#-4] + and r7,lr,r0,lsr#16 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Ldec_loop + + add r10,r10,#1024 + + ldr r5,[r10,#0] @ prefetch Td4 + ldr r6,[r10,#32] + ldr r4,[r10,#64] + ldr r5,[r10,#96] + ldr r6,[r10,#128] + ldr r4,[r10,#160] + ldr r5,[r10,#192] + ldr r6,[r10,#224] + + ldrb r0,[r10,r0] @ Td4[s0>>24] + ldrb r4,[r10,r7] @ Td4[s0>>16] + and r7,lr,r1 @ i0 + ldrb r5,[r10,r8] @ Td4[s0>>8] + and r8,lr,r1,lsr#16 + ldrb r6,[r10,r9] @ Td4[s0>>0] + and r9,lr,r1,lsr#8 + + add r1,r10,r1,lsr#24 + ldrb r7,[r10,r7] @ Td4[s1>>0] + ldrb r1,[r1] @ Td4[s1>>24] + ldrb r8,[r10,r8] @ Td4[s1>>16] + eor r0,r7,r0,lsl#24 + ldrb r9,[r10,r9] @ Td4[s1>>8] + eor r1,r4,r1,lsl#8 + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,lsl#8 + and r8,lr,r2 @ i1 + ldrb r7,[r10,r7] @ Td4[s2>>8] + eor r6,r6,r9,lsl#8 + ldrb r8,[r10,r8] @ Td4[s2>>0] + and r9,lr,r2,lsr#16 + + add r2,r10,r2,lsr#24 + ldrb r2,[r2] @ Td4[s2>>24] + eor r0,r0,r7,lsl#8 + ldrb r9,[r10,r9] @ Td4[s2>>16] + eor r1,r8,r1,lsl#16 + and r7,lr,r3,lsr#16 @ i0 + eor r2,r5,r2,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + ldrb r7,[r10,r7] @ Td4[s3>>16] + eor r6,r6,r9,lsl#16 + ldrb r8,[r10,r8] @ Td4[s3>>8] + and r9,lr,r3 @ i2 + + add r3,r10,r3,lsr#24 + ldrb r9,[r10,r9] @ Td4[s3>>0] + ldrb r3,[r3] @ Td4[s3>>24] + eor r0,r0,r7,lsl#16 + ldr r7,[r11,#0] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r9,r2,lsl#8 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#1024 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_decrypt,.-_armv4_AES_decrypt +.asciz "AES for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff 
--git a/app/openssl/crypto/aes/asm/aes-armv4.pl b/app/openssl/crypto/aes/asm/aes-armv4.pl index 86b86c4a..4f891708 100644 --- a/app/openssl/crypto/aes/asm/aes-armv4.pl +++ b/app/openssl/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,9 +51,23 @@ $key="r11"; $rounds="r12"; $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + .text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else .code 32 +# endif +#endif .type AES_Te,%object .align 5 @@ -167,7 +181,11 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -409,11 +427,21 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif movne r0,#-1 bne .Labrt @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif 
subeq r2,$key,#216 beq .Ldone @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#256 beq .Ldone @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key: str $i3,[$key,#-4] b .L256_loop +.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -688,34 +731,57 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldrne lr,[sp],#4 @ pop lr + ldr lr,[sp],#4 @ pop lr bne .Labrt - stmdb sp!,{r4-r12} + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov $key,r2 @ which is AES_KEY *key - mov $i1,r2 - add $i2,r2,$rounds,lsl#4 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr $rounds,[r0,#240] + mov $i1,r0 @ input + add $i2,r0,$rounds,lsl#4 + mov $key,r1 @ output + add $tbl,r1,$rounds,lsl#4 + str $rounds,[r1,#240] + +.Linv: ldr $s0,[$i1],#16 + ldr $s1,[$i1,#-12] + ldr $s2,[$i1,#-8] + ldr $s3,[$i1,#-4] + ldr $t1,[$i2],#-16 + ldr $t2,[$i2,#16+4] + ldr $t3,[$i2,#16+8] + ldr $i3,[$i2,#16+12] + str $s0,[$tbl],#-16 + str $s1,[$tbl,#16+4] + str $s2,[$tbl,#16+8] + str $s3,[$tbl,#16+12] + str $t1,[$key],#16 + str $t2,[$key,#-12] + str $t3,[$key,#-8] + str $i3,[$key,#-4] + teq $i1,$i2 + bne .Linv -.Linv: ldr $s0,[$i1] + ldr $s0,[$i1] ldr $s1,[$i1,#4] ldr $s2,[$i1,#8] ldr 
$s3,[$i1,#12] - ldr $t1,[$i2] - ldr $t2,[$i2,#4] - ldr $t3,[$i2,#8] - ldr $i3,[$i2,#12] - str $s0,[$i2],#-16 - str $s1,[$i2,#16+4] - str $s2,[$i2,#16+8] - str $s3,[$i2,#16+12] - str $t1,[$i1],#16 - str $t2,[$i1,#-12] - str $t3,[$i1,#-8] - str $i3,[$i1,#-4] - teq $i1,$i2 - bne .Linv + str $s0,[$key] + str $s1,[$key,#4] + str $s2,[$key,#8] + str $s3,[$key,#12] + sub $key,$key,$rounds,lsl#3 ___ $mask80=$i1; $mask1b=$i2; @@ -773,7 +839,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key .type AES_Td,%object .align 5 @@ -883,7 +949,11 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt: ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] and $i3,lr,$s1,lsr#8 + add $s1,$tbl,$s1,lsr#24 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] - ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] + ldrb $s1,[$s1] @ Td4[s1>>24] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] eor $s0,$i1,$s0,lsl#24 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] + add $s2,$tbl,$s2,lsr#24 + ldrb $s2,[$s2] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] eor $s1,$i2,$s1,lsl#16 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 + add $s3,$tbl,$s3,lsr#24 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] - ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] + ldrb $s3,[$s3] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 ldr $i1,[$key,#0] eor $s1,$s1,$i2,lsl#8 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; + +open SELF,$0; 
+while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/app/openssl/crypto/aes/asm/aes-armv4.s b/app/openssl/crypto/aes/asm/aes-armv4.s deleted file mode 100644 index 2697d4ce..00000000 --- a/app/openssl/crypto/aes/asm/aes-armv4.s +++ /dev/null @@ -1,1071 +0,0 @@ -#include "arm_arch.h" -.text -.code 32 - -.type AES_Te,%object -.align 5 -AES_Te: -.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d -.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 -.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d -.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a -.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 -.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b -.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea -.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b -.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a -.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f -.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 -.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f -.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e -.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 -.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d -.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f -.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e -.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb -.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce -.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 -.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c -.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed -.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b -.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a -.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 -.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 -.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 -.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 -.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 
0x058f8f8a -.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 -.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 -.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d -.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f -.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 -.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 -.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 -.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f -.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 -.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c -.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 -.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e -.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 -.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 -.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b -.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 -.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 -.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 -.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 -.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 -.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 -.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 -.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 -.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa -.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 -.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 -.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 -.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 -.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 -.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 -.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a -.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 -.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 -.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 -.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a -@ Te4[256] -.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 -.byte 0x30, 0x01, 0x67, 0x2b, 
0xfe, 0xd7, 0xab, 0x76 -.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 -.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 -.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc -.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 -.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a -.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 -.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 -.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 -.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b -.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf -.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 -.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 -.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 -.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 -.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 -.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 -.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 -.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb -.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c -.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 -.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 -.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 -.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 -.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a -.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e -.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e -.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 -.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf -.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 -.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 -@ rcon[] -.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 -.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 -.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 -.size AES_Te,.-AES_Te - -@ void AES_encrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.global AES_encrypt -.type AES_encrypt,%function -.align 5 -AES_encrypt: - 
sub r3,pc,#8 @ AES_encrypt - stmdb sp!,{r1,r4-r12,lr} - mov r12,r0 @ inp - mov r11,r2 - sub r10,r3,#AES_encrypt-AES_Te @ Te -#if __ARM_ARCH__<7 - ldrb r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - orr r3,r3,r5,lsl#16 - orr r3,r3,r6,lsl#24 -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#endif - bl _armv4_AES_encrypt - - ldr r12,[sp],#4 @ pop out -#if __ARM_ARCH__>=7 -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r12,#0] - str r1,[r12,#4] - str r2,[r12,#8] - str r3,[r12,#12] -#else - mov r4,r0,lsr#24 @ write output in endian-neutral - mov r5,r0,lsr#16 @ manner... 
- mov r6,r0,lsr#8 - strb r4,[r12,#0] - strb r5,[r12,#1] - mov r4,r1,lsr#24 - strb r6,[r12,#2] - mov r5,r1,lsr#16 - strb r0,[r12,#3] - mov r6,r1,lsr#8 - strb r4,[r12,#4] - strb r5,[r12,#5] - mov r4,r2,lsr#24 - strb r6,[r12,#6] - mov r5,r2,lsr#16 - strb r1,[r12,#7] - mov r6,r2,lsr#8 - strb r4,[r12,#8] - strb r5,[r12,#9] - mov r4,r3,lsr#24 - strb r6,[r12,#10] - mov r5,r3,lsr#16 - strb r2,[r12,#11] - mov r6,r3,lsr#8 - strb r4,[r12,#12] - strb r5,[r12,#13] - strb r6,[r12,#14] - strb r3,[r12,#15] -#endif -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size AES_encrypt,.-AES_encrypt - -.type _armv4_AES_encrypt,%function -.align 2 -_armv4_AES_encrypt: - str lr,[sp,#-4]! @ push lr - ldmia r11!,{r4-r7} - eor r0,r0,r4 - ldr r12,[r11,#240-16] - eor r1,r1,r5 - eor r2,r2,r6 - eor r3,r3,r7 - sub r12,r12,#1 - mov lr,#255 - - and r7,lr,r0 - and r8,lr,r0,lsr#8 - and r9,lr,r0,lsr#16 - mov r0,r0,lsr#24 -.Lenc_loop: - ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] - and r7,lr,r1,lsr#16 @ i0 - ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] - and r8,lr,r1 - ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] - and r9,lr,r1,lsr#8 - ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] - mov r1,r1,lsr#24 - - ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] - ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] - ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] - eor r0,r0,r7,ror#8 - ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r5,r8,ror#8 - and r8,lr,r2,lsr#16 @ i1 - eor r6,r6,r9,ror#8 - and r9,lr,r2 - ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] - eor r1,r1,r4,ror#24 - ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] - mov r2,r2,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] - eor r0,r0,r7,ror#16 - ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] - and r7,lr,r3 @ i0 - eor r1,r1,r8,ror#8 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r6,r9,ror#16 - and r9,lr,r3,lsr#16 @ i2 - ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] - eor r2,r2,r5,ror#16 - ldr r8,[r10,r8,lsl#2] @ 
Te2[s3>>8] - mov r3,r3,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] - eor r0,r0,r7,ror#24 - ldr r7,[r11],#16 - eor r1,r1,r8,ror#16 - ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] - eor r2,r2,r9,ror#8 - ldr r4,[r11,#-12] - eor r3,r3,r6,ror#8 - - ldr r5,[r11,#-8] - eor r0,r0,r7 - ldr r6,[r11,#-4] - and r7,lr,r0 - eor r1,r1,r4 - and r8,lr,r0,lsr#8 - eor r2,r2,r5 - and r9,lr,r0,lsr#16 - eor r3,r3,r6 - mov r0,r0,lsr#24 - - subs r12,r12,#1 - bne .Lenc_loop - - add r10,r10,#2 - - ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] - and r7,lr,r1,lsr#16 @ i0 - ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] - and r8,lr,r1 - ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] - and r9,lr,r1,lsr#8 - ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] - mov r1,r1,lsr#24 - - ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] - ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] - ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] - eor r0,r7,r0,lsl#8 - ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r8,r5,lsl#8 - and r8,lr,r2,lsr#16 @ i1 - eor r6,r9,r6,lsl#8 - and r9,lr,r2 - ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] - eor r1,r4,r1,lsl#24 - ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] - mov r2,r2,lsr#24 - - ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] - eor r0,r7,r0,lsl#8 - ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] - and r7,lr,r3 @ i0 - eor r1,r1,r8,lsl#16 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r9,r6,lsl#8 - and r9,lr,r3,lsr#16 @ i2 - ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] - eor r2,r5,r2,lsl#24 - ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] - mov r3,r3,lsr#24 - - ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] - eor r0,r7,r0,lsl#8 - ldr r7,[r11,#0] - ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] - eor r1,r1,r8,lsl#8 - ldr r4,[r11,#4] - eor r2,r2,r9,lsl#16 - ldr r5,[r11,#8] - eor r3,r6,r3,lsl#24 - ldr r6,[r11,#12] - - eor r0,r0,r7 - eor r1,r1,r4 - eor r2,r2,r5 - eor r3,r3,r6 - - sub r10,r10,#2 - ldr pc,[sp],#4 @ pop and return -.size _armv4_AES_encrypt,.-_armv4_AES_encrypt - -.global private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,%function -.align 5 -private_AES_set_encrypt_key: -_armv4_AES_set_encrypt_key: - 
sub r3,pc,#8 @ AES_set_encrypt_key - teq r0,#0 - moveq r0,#-1 - beq .Labrt - teq r2,#0 - moveq r0,#-1 - beq .Labrt - - teq r1,#128 - beq .Lok - teq r1,#192 - beq .Lok - teq r1,#256 - movne r0,#-1 - bne .Labrt - -.Lok: stmdb sp!,{r4-r12,lr} - sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 - - mov r12,r0 @ inp - mov lr,r1 @ bits - mov r11,r2 @ key - -#if __ARM_ARCH__<7 - ldrb r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - str r0,[r11],#16 - orr r3,r3,r5,lsl#16 - str r1,[r11,#-12] - orr r3,r3,r6,lsl#24 - str r2,[r11,#-8] - str r3,[r11,#-4] -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r11],#16 - str r1,[r11,#-12] - str r2,[r11,#-8] - str r3,[r11,#-4] -#endif - - teq lr,#128 - bne .Lnot128 - mov r12,#10 - str r12,[r11,#240-16] - add r6,r10,#256 @ rcon - mov lr,#255 - -.L128_loop: - and r5,lr,r3,lsr#24 - and r7,lr,r3,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r3,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r3 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r5,r5,r4 - eor r0,r0,r5 @ rk[4]=rk[0]^... 
- eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] - str r0,[r11],#16 - eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] - str r1,[r11,#-12] - eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] - str r2,[r11,#-8] - subs r12,r12,#1 - str r3,[r11,#-4] - bne .L128_loop - sub r2,r11,#176 - b .Ldone - -.Lnot128: -#if __ARM_ARCH__<7 - ldrb r8,[r12,#19] - ldrb r4,[r12,#18] - ldrb r5,[r12,#17] - ldrb r6,[r12,#16] - orr r8,r8,r4,lsl#8 - ldrb r9,[r12,#23] - orr r8,r8,r5,lsl#16 - ldrb r4,[r12,#22] - orr r8,r8,r6,lsl#24 - ldrb r5,[r12,#21] - ldrb r6,[r12,#20] - orr r9,r9,r4,lsl#8 - orr r9,r9,r5,lsl#16 - str r8,[r11],#8 - orr r9,r9,r6,lsl#24 - str r9,[r11,#-4] -#else - ldr r8,[r12,#16] - ldr r9,[r12,#20] -#ifdef __ARMEL__ - rev r8,r8 - rev r9,r9 -#endif - str r8,[r11],#8 - str r9,[r11,#-4] -#endif - - teq lr,#192 - bne .Lnot192 - mov r12,#12 - str r12,[r11,#240-24] - add r6,r10,#256 @ rcon - mov lr,#255 - mov r12,#8 - -.L192_loop: - and r5,lr,r9,lsr#24 - and r7,lr,r9,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r9,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r9 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r9,r5,r4 - eor r0,r0,r9 @ rk[6]=rk[0]^... 
- eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] - str r0,[r11],#24 - eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] - str r1,[r11,#-20] - eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] - str r2,[r11,#-16] - subs r12,r12,#1 - str r3,[r11,#-12] - subeq r2,r11,#216 - beq .Ldone - - ldr r7,[r11,#-32] - ldr r8,[r11,#-28] - eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] - eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] - str r7,[r11,#-8] - str r9,[r11,#-4] - b .L192_loop - -.Lnot192: -#if __ARM_ARCH__<7 - ldrb r8,[r12,#27] - ldrb r4,[r12,#26] - ldrb r5,[r12,#25] - ldrb r6,[r12,#24] - orr r8,r8,r4,lsl#8 - ldrb r9,[r12,#31] - orr r8,r8,r5,lsl#16 - ldrb r4,[r12,#30] - orr r8,r8,r6,lsl#24 - ldrb r5,[r12,#29] - ldrb r6,[r12,#28] - orr r9,r9,r4,lsl#8 - orr r9,r9,r5,lsl#16 - str r8,[r11],#8 - orr r9,r9,r6,lsl#24 - str r9,[r11,#-4] -#else - ldr r8,[r12,#24] - ldr r9,[r12,#28] -#ifdef __ARMEL__ - rev r8,r8 - rev r9,r9 -#endif - str r8,[r11],#8 - str r9,[r11,#-4] -#endif - - mov r12,#14 - str r12,[r11,#240-32] - add r6,r10,#256 @ rcon - mov lr,#255 - mov r12,#7 - -.L256_loop: - and r5,lr,r9,lsr#24 - and r7,lr,r9,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r9,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r9 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r9,r5,r4 - eor r0,r0,r9 @ rk[8]=rk[0]^... - eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] - str r0,[r11],#32 - eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] - str r1,[r11,#-28] - eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] - str r2,[r11,#-24] - subs r12,r12,#1 - str r3,[r11,#-20] - subeq r2,r11,#256 - beq .Ldone - - and r5,lr,r3 - and r7,lr,r3,lsr#8 - ldrb r5,[r10,r5] - and r8,lr,r3,lsr#16 - ldrb r7,[r10,r7] - and r9,lr,r3,lsr#24 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#8 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r11,#-48] - orr r5,r5,r9,lsl#24 - - ldr r7,[r11,#-44] - ldr r8,[r11,#-40] - eor r4,r4,r5 @ rk[12]=rk[4]^... 
- ldr r9,[r11,#-36] - eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] - str r4,[r11,#-16] - eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] - str r7,[r11,#-12] - eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] - str r8,[r11,#-8] - str r9,[r11,#-4] - b .L256_loop - -.Ldone: mov r0,#0 - ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key - -.global private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,%function -.align 5 -private_AES_set_decrypt_key: - str lr,[sp,#-4]! @ push lr - bl _armv4_AES_set_encrypt_key - teq r0,#0 - ldrne lr,[sp],#4 @ pop lr - bne .Labrt - - stmdb sp!,{r4-r12} - - ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov r11,r2 @ which is AES_KEY *key - mov r7,r2 - add r8,r2,r12,lsl#4 - -.Linv: ldr r0,[r7] - ldr r1,[r7,#4] - ldr r2,[r7,#8] - ldr r3,[r7,#12] - ldr r4,[r8] - ldr r5,[r8,#4] - ldr r6,[r8,#8] - ldr r9,[r8,#12] - str r0,[r8],#-16 - str r1,[r8,#16+4] - str r2,[r8,#16+8] - str r3,[r8,#16+12] - str r4,[r7],#16 - str r5,[r7,#-12] - str r6,[r7,#-8] - str r9,[r7,#-4] - teq r7,r8 - bne .Linv - ldr r0,[r11,#16]! 
@ prefetch tp1 - mov r7,#0x80 - mov r8,#0x1b - orr r7,r7,#0x8000 - orr r8,r8,#0x1b00 - orr r7,r7,r7,lsl#16 - orr r8,r8,r8,lsl#16 - sub r12,r12,#1 - mvn r9,r7 - mov r12,r12,lsl#2 @ (rounds-1)*4 - -.Lmix: and r4,r0,r7 - and r1,r0,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r1,r4,r1,lsl#1 @ tp2 - - and r4,r1,r7 - and r2,r1,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r2,r4,r2,lsl#1 @ tp4 - - and r4,r2,r7 - and r3,r2,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r3,r4,r3,lsl#1 @ tp8 - - eor r4,r1,r2 - eor r5,r0,r3 @ tp9 - eor r4,r4,r3 @ tpe - eor r4,r4,r1,ror#24 - eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) - eor r4,r4,r2,ror#16 - eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) - eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) - - ldr r0,[r11,#4] @ prefetch tp1 - str r4,[r11],#4 - subs r12,r12,#1 - bne .Lmix - - mov r0,#0 -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - -.type AES_Td,%object -.align 5 -AES_Td: -.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 -.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 -.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 -.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f -.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 -.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 -.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da -.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 -.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd -.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 -.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 -.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 -.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 -.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a -.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 -.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c 
-.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 -.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a -.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 -.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 -.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 -.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff -.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 -.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb -.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 -.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e -.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 -.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a -.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e -.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 -.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d -.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 -.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd -.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 -.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 -.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 -.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d -.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 -.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 -.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef -.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 -.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 -.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 -.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 -.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 -.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b -.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 -.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 -.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 -.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 -.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 -.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f -.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df 
-.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f -.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e -.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 -.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 -.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c -.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf -.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 -.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f -.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 -.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 -.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 -@ Td4[256] -.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 -.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb -.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 -.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb -.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d -.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e -.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 -.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 -.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 -.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 -.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda -.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 -.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a -.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 -.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 -.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b -.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea -.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 -.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 -.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e -.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 -.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b -.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 -.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 -.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 -.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 
0xec, 0x5f -.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d -.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef -.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 -.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 -.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 -.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d -.size AES_Td,.-AES_Td - -@ void AES_decrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.global AES_decrypt -.type AES_decrypt,%function -.align 5 -AES_decrypt: - sub r3,pc,#8 @ AES_decrypt - stmdb sp!,{r1,r4-r12,lr} - mov r12,r0 @ inp - mov r11,r2 - sub r10,r3,#AES_decrypt-AES_Td @ Td -#if __ARM_ARCH__<7 - ldrb r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - orr r3,r3,r5,lsl#16 - orr r3,r3,r6,lsl#24 -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#endif - bl _armv4_AES_decrypt - - ldr r12,[sp],#4 @ pop out -#if __ARM_ARCH__>=7 -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r12,#0] - str r1,[r12,#4] - str r2,[r12,#8] - str r3,[r12,#12] -#else - mov r4,r0,lsr#24 @ write output in endian-neutral - mov r5,r0,lsr#16 @ manner... 
- mov r6,r0,lsr#8 - strb r4,[r12,#0] - strb r5,[r12,#1] - mov r4,r1,lsr#24 - strb r6,[r12,#2] - mov r5,r1,lsr#16 - strb r0,[r12,#3] - mov r6,r1,lsr#8 - strb r4,[r12,#4] - strb r5,[r12,#5] - mov r4,r2,lsr#24 - strb r6,[r12,#6] - mov r5,r2,lsr#16 - strb r1,[r12,#7] - mov r6,r2,lsr#8 - strb r4,[r12,#8] - strb r5,[r12,#9] - mov r4,r3,lsr#24 - strb r6,[r12,#10] - mov r5,r3,lsr#16 - strb r2,[r12,#11] - mov r6,r3,lsr#8 - strb r4,[r12,#12] - strb r5,[r12,#13] - strb r6,[r12,#14] - strb r3,[r12,#15] -#endif -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size AES_decrypt,.-AES_decrypt - -.type _armv4_AES_decrypt,%function -.align 2 -_armv4_AES_decrypt: - str lr,[sp,#-4]! @ push lr - ldmia r11!,{r4-r7} - eor r0,r0,r4 - ldr r12,[r11,#240-16] - eor r1,r1,r5 - eor r2,r2,r6 - eor r3,r3,r7 - sub r12,r12,#1 - mov lr,#255 - - and r7,lr,r0,lsr#16 - and r8,lr,r0,lsr#8 - and r9,lr,r0 - mov r0,r0,lsr#24 -.Ldec_loop: - ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] - and r7,lr,r1 @ i0 - ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] - and r8,lr,r1,lsr#16 - ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] - and r9,lr,r1,lsr#8 - ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] - mov r1,r1,lsr#24 - - ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] - ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] - ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] - eor r0,r0,r7,ror#24 - ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r8,r5,ror#8 - and r8,lr,r2 @ i1 - eor r6,r9,r6,ror#8 - and r9,lr,r2,lsr#16 - ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] - eor r1,r1,r4,ror#8 - ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] - mov r2,r2,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] - eor r0,r0,r7,ror#16 - ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] - and r7,lr,r3,lsr#16 @ i0 - eor r1,r1,r8,ror#24 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r9,r6,ror#8 - and r9,lr,r3 @ i2 - ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] - eor r2,r2,r5,ror#8 - ldr r8,[r10,r8,lsl#2] @ 
Td2[s3>>8] - mov r3,r3,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] - eor r0,r0,r7,ror#8 - ldr r7,[r11],#16 - eor r1,r1,r8,ror#16 - ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] - eor r2,r2,r9,ror#24 - - ldr r4,[r11,#-12] - eor r0,r0,r7 - ldr r5,[r11,#-8] - eor r3,r3,r6,ror#8 - ldr r6,[r11,#-4] - and r7,lr,r0,lsr#16 - eor r1,r1,r4 - and r8,lr,r0,lsr#8 - eor r2,r2,r5 - and r9,lr,r0 - eor r3,r3,r6 - mov r0,r0,lsr#24 - - subs r12,r12,#1 - bne .Ldec_loop - - add r10,r10,#1024 - - ldr r5,[r10,#0] @ prefetch Td4 - ldr r6,[r10,#32] - ldr r4,[r10,#64] - ldr r5,[r10,#96] - ldr r6,[r10,#128] - ldr r4,[r10,#160] - ldr r5,[r10,#192] - ldr r6,[r10,#224] - - ldrb r0,[r10,r0] @ Td4[s0>>24] - ldrb r4,[r10,r7] @ Td4[s0>>16] - and r7,lr,r1 @ i0 - ldrb r5,[r10,r8] @ Td4[s0>>8] - and r8,lr,r1,lsr#16 - ldrb r6,[r10,r9] @ Td4[s0>>0] - and r9,lr,r1,lsr#8 - - ldrb r7,[r10,r7] @ Td4[s1>>0] - ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24] - ldrb r8,[r10,r8] @ Td4[s1>>16] - eor r0,r7,r0,lsl#24 - ldrb r9,[r10,r9] @ Td4[s1>>8] - eor r1,r4,r1,lsl#8 - and r7,lr,r2,lsr#8 @ i0 - eor r5,r5,r8,lsl#8 - and r8,lr,r2 @ i1 - ldrb r7,[r10,r7] @ Td4[s2>>8] - eor r6,r6,r9,lsl#8 - ldrb r8,[r10,r8] @ Td4[s2>>0] - and r9,lr,r2,lsr#16 - - ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24] - eor r0,r0,r7,lsl#8 - ldrb r9,[r10,r9] @ Td4[s2>>16] - eor r1,r8,r1,lsl#16 - and r7,lr,r3,lsr#16 @ i0 - eor r2,r5,r2,lsl#16 - and r8,lr,r3,lsr#8 @ i1 - ldrb r7,[r10,r7] @ Td4[s3>>16] - eor r6,r6,r9,lsl#16 - ldrb r8,[r10,r8] @ Td4[s3>>8] - and r9,lr,r3 @ i2 - - ldrb r9,[r10,r9] @ Td4[s3>>0] - ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24] - eor r0,r0,r7,lsl#16 - ldr r7,[r11,#0] - eor r1,r1,r8,lsl#8 - ldr r4,[r11,#4] - eor r2,r9,r2,lsl#8 - ldr r5,[r11,#8] - eor r3,r6,r3,lsl#24 - ldr r6,[r11,#12] - - eor r0,r0,r7 - eor r1,r1,r4 - eor r2,r2,r5 - eor r3,r3,r6 - - sub r10,r10,#1024 - ldr pc,[sp],#4 @ pop and return -.size _armv4_AES_decrypt,.-_armv4_AES_decrypt -.asciz "AES for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align 2 diff --git 
a/app/openssl/crypto/aes/asm/aesv8-armx-64.S b/app/openssl/crypto/aes/asm/aesv8-armx-64.S new file mode 100644 index 00000000..be0a13df --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx-64.S @@ -0,0 +1,761 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.arch armv8-a+crypto +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + adr x3,rcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +.align 4 +.L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // 
adjust the mask + +.Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne .Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b .Ldone + +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + + eor x0,x0,x0 // return value + ldr x29,[sp],#16 + ret +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl .Lenc_key + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +.Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi .Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value + ldp x29,x30,[sp],#16 + ret +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_enc: + aese v2.16b,v0.16b + ld1 {v0.4s},[x2],#16 + aesmc v2.16b,v2.16b + subs w3,w3,#2 + aese v2.16b,v1.16b + ld1 {v1.4s},[x2],#16 + aesmc v2.16b,v2.16b + b.gt .Loop_enc + + aese v2.16b,v0.16b + ld1 {v0.4s},[x2] + aesmc v2.16b,v2.16b + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_encrypt,.-aes_v8_encrypt +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_dec: + aesd v2.16b,v0.16b + ld1 {v0.4s},[x2],#16 + aesimc v2.16b,v2.16b + subs w3,w3,#2 + aesd v2.16b,v1.16b + ld1 {v1.4s},[x2],#16 + aesimc v2.16b,v2.16b + b.gt .Loop_dec + + aesd v2.16b,v0.16b + ld1 {v0.4s},[x2] + aesimc v2.16b,v2.16b + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_cbc_encrypt +.type aes_v8_cbc_encrypt,%function +.align 5 +aes_v8_cbc_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? 
+ ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s-v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s-v19.4s},[x7],#32 + ld1 {v20.4s-v21.4s},[x7],#32 + ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq .Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + b.gt .Loop_cbc_enc + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + add x7,x3,#16 + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + + mov w6,w5 + eor v6.16b,v0.16b,v7.16b + st1 {v6.16b},[x1],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + ld1 {v2.4s-v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc128 + + st1 
{v6.16b},[x1],#16 + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + ld1 {v4.4s-v5.4s},[x7] + eor v6.16b,v6.16b,v7.16b + eor v2.16b,v0.16b,v7.16b + mov x12,x8 + +.Loop2x_cbc_dec128: + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + subs x2,x2,#32 + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + csel x8,xzr,x8,lo + aesd v0.16b,v4.16b + aesd v1.16b,v4.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + csel x12,xzr,x12,ls + aesd v0.16b,v5.16b + aesd v1.16b,v5.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v18.16b + aesd v1.16b,v18.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v19.16b + aesd v1.16b,v19.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v20.16b + aesd v1.16b,v20.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v21.16b + aesd v1.16b,v21.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v22.16b + aesd v1.16b,v22.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + + eor v6.16b,v6.16b,v0.16b + ld1 {v0.16b},[x0],x8 + eor v2.16b,v2.16b,v1.16b + ld1 {v1.16b},[x0],x12 + st1 {v6.16b},[x1],#16 + eor v6.16b,v3.16b,v7.16b + st1 {v2.16b},[x1],#16 + eor v2.16b,v0.16b,v7.16b + orr v3.16b,v1.16b,v1.16b + b.hs .Loop2x_cbc_dec128 + + adds x2,x2,#32 + eor v6.16b,v6.16b,v7.16b + b.eq .Lcbc_done + eor v2.16b,v2.16b,v7.16b + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs x2,x2,#16 + orr v2.16b,v0.16b,v0.16b + b.lo .Lcbc_dec_tail + + csel x8,xzr,x8,eq + cmp w5,#2 + ld1 {v1.16b},[x0],x8 + orr v3.16b,v1.16b,v1.16b + b.eq .Lcbc_dec128 + +.Loop2x_cbc_dec: + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + b.gt .Loop2x_cbc_dec + + aesd v0.16b,v16.16b + aesd v1.16b,v16.16b + aesimc 
v0.16b,v0.16b + aesimc v1.16b,v1.16b + eor v4.16b,v6.16b,v7.16b + eor v5.16b,v2.16b,v7.16b + aesd v0.16b,v17.16b + aesd v1.16b,v17.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + orr v6.16b,v3.16b,v3.16b + subs x2,x2,#32 + aesd v0.16b,v18.16b + aesd v1.16b,v18.16b + aesimc v0.16b,v0.16b + csel x8,xzr,x8,lo + aesimc v1.16b,v1.16b + mov x7,x3 + aesd v0.16b,v19.16b + aesd v1.16b,v19.16b + aesimc v0.16b,v0.16b + ld1 {v2.16b},[x0],x8 + aesimc v1.16b,v1.16b + csel x8,xzr,x8,ls + aesd v0.16b,v20.16b + aesd v1.16b,v20.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v3.16b},[x0],x8 + aesd v0.16b,v21.16b + aesd v1.16b,v21.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + aesd v0.16b,v22.16b + aesd v1.16b,v22.16b + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + + mov w6,w5 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + orr v0.16b,v2.16b,v2.16b + st1 {v4.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v5.16b},[x1],#16 + b.hs .Loop2x_cbc_dec + + adds x2,x2,#32 + b.eq .Lcbc_done + +.Lcbc_dec_tail: + aesd v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesimc v0.16b,v0.16b + subs w6,w6,#2 + aesd v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesimc v0.16b,v0.16b + b.gt .Lcbc_dec_tail + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + eor v4.16b,v6.16b,v7.16b + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + orr v6.16b,v2.16b,v2.16b + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v23.16b + + eor v4.16b,v4.16b,v0.16b + st1 {v4.16b},[x1],#16 + +.Lcbc_done: + st1 {v6.16b},[x4] +.Lcbc_abort: + ldr x29,[sp],#16 + ret +.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks +.type aes_v8_ctr32_encrypt_blocks,%function +.align 5 
+aes_v8_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s-v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s-v19.4s},[x7],#32 + ld1 {v20.4s-v21.4s},[x7],#32 + ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + + subs x2,x2,#2 + b.lo .Lctr32_tail + +#ifndef __ARMEB__ + rev w8, w8 +#endif + orr v1.16b,v0.16b,v0.16b + add w8, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w8 + cmp w5,#2 + mov v1.s[3],w10 + b.eq .Lctr32_128 + +.Loop2x_ctr32: + aese v0.16b,v16.16b + aese v1.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + aese v1.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + b.gt .Loop2x_ctr32 + + aese v0.16b,v16.16b + aese v1.16b,v16.16b + aesmc v4.16b,v0.16b + orr v0.16b,v6.16b,v6.16b + aesmc v5.16b,v1.16b + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aese v5.16b,v17.16b + ld1 {v2.16b},[x0],#16 + aesmc v4.16b,v4.16b + ld1 {v3.16b},[x0],#16 + aesmc v5.16b,v5.16b + add w8,w8,#1 + aese v4.16b,v18.16b + aese v5.16b,v18.16b + rev w9,w8 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + add w8,w8,#1 + aese v4.16b,v19.16b + aese v5.16b,v19.16b + eor v2.16b,v2.16b,v7.16b + rev w10,w8 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + eor v3.16b,v3.16b,v7.16b + mov x7,x3 + aese v4.16b,v20.16b + aese v5.16b,v20.16b + subs x2,x2,#2 + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1] + aese v4.16b,v21.16b + aese v5.16b,v21.16b + aesmc v4.16b,v4.16b + aesmc v5.16b,v5.16b + aese v4.16b,v22.16b + aese v5.16b,v22.16b + mov v0.s[3], w9 + aesmc v4.16b,v4.16b + mov v1.s[3], w10 + aesmc v5.16b,v5.16b + aese v4.16b,v23.16b + aese v5.16b,v23.16b + + mov w6,w5 + eor v2.16b,v2.16b,v4.16b + eor v3.16b,v3.16b,v5.16b + st1 {v2.16b},[x1],#16 + st1 
{v3.16b},[x1],#16 + b.hs .Loop2x_ctr32 + + adds x2,x2,#2 + b.eq .Lctr32_done + b .Lctr32_tail + +.Lctr32_128: + ld1 {v4.4s-v5.4s},[x7] + +.Loop2x_ctr32_128: + aese v0.16b,v16.16b + aese v1.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v2.16b},[x0],#16 + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0],#16 + aese v0.16b,v17.16b + aese v1.16b,v17.16b + add w8,w8,#1 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + rev w9,w8 + aese v0.16b,v4.16b + aese v1.16b,v4.16b + add w8,w8,#1 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + rev w10,w8 + aese v0.16b,v5.16b + aese v1.16b,v5.16b + subs x2,x2,#2 + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v18.16b + aese v1.16b,v18.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v19.16b + aese v1.16b,v19.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v20.16b + aese v1.16b,v20.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v21.16b + aese v1.16b,v21.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + aese v0.16b,v22.16b + aese v1.16b,v22.16b + aesmc v0.16b,v0.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v23.16b + eor v3.16b,v3.16b,v7.16b + aese v1.16b,v23.16b + + eor v2.16b,v2.16b,v0.16b + orr v0.16b,v6.16b,v6.16b + eor v3.16b,v3.16b,v1.16b + orr v1.16b,v6.16b,v6.16b + st1 {v2.16b},[x1],#16 + mov v0.s[3], w9 + st1 {v3.16b},[x1],#16 + mov v1.s[3], w10 + b.hs .Loop2x_ctr32_128 + + adds x2,x2,#2 + b.eq .Lctr32_done + +.Lctr32_tail: + aese v0.16b,v16.16b + ld1 {v16.4s},[x7],#16 + aesmc v0.16b,v0.16b + subs w6,w6,#2 + aese v0.16b,v17.16b + ld1 {v17.4s},[x7],#16 + aesmc v0.16b,v0.16b + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v2.16b},[x0] + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v23.16b + + eor 
v2.16b,v2.16b,v0.16b + st1 {v2.16b},[x1] + +.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.S b/app/openssl/crypto/aes/asm/aesv8-armx.S new file mode 100644 index 00000000..1637e4d4 --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx.S @@ -0,0 +1,767 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.fpu neon +.code 32 +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + adr r3,rcon + cmp r1,#192 + + veor q0,q0,q0 + vld1.8 {q3},[r0]! + mov r1,#8 @ reuse r1 + vld1.32 {q1,q2},[r3]! + + blt .Loop128 + beq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + bne .Loop128 + + vld1.32 {q1},[r3] + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + veor q3,q3,q10 + vst1.32 {q3},[r2] + add r2,r2,#0x50 + + mov r12,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {d16},[r0]! + vmov.i8 q10,#8 @ borrow q10 + vst1.32 {q3},[r2]! 
+ vsub.i8 q2,q2,q10 @ adjust the mask + +.Loop192: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {d16},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + + vdup.32 q9,d7[1] + veor q9,q9,q8 + veor q10,q10,q1 + vext.8 q8,q0,q8,#12 + vshl.u8 q1,q1,#1 + veor q8,q8,q9 + veor q3,q3,q10 + veor q8,q8,q10 + vst1.32 {q3},[r2]! + bne .Loop192 + + mov r12,#12 + add r2,r2,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 {q8},[r0] + mov r1,#7 + mov r12,#14 + vst1.32 {q3},[r2]! + +.Loop256: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q8},[r2]! + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + vst1.32 {q3},[r2]! + beq .Ldone + + vdup.32 q10,d7[1] + vext.8 q9,q0,q8,#12 + .byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + + veor q8,q8,q10 + b .Loop256 + +.Ldone: + str r12,[r2] + + eor r0,r0,r0 @ return value + + bx lr +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + stmdb sp!,{r4,lr} + bl .Lenc_key + + sub r2,r2,#240 @ restore original r2 + mov r4,#-16 + add r0,r2,r12,lsl#4 @ end of key schedule + + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + +.Loop_imc: + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! 
+ cmp r0,r2 + bhi .Loop_imc + + vld1.32 {q0},[r2] + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vst1.32 {q0},[r0] + + eor r0,r0,r0 @ return value + ldmia sp!,{r4,pc} +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_enc: + .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 + vld1.32 {q0},[r2]! + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + subs r3,r3,#2 + .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + vld1.32 {q1},[r2]! + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + bgt .Loop_enc + + .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 + vld1.32 {q0},[r2] + .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_v8_encrypt,.-aes_v8_encrypt +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_dec: + .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 + vld1.32 {q0},[r2]! + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + subs r3,r3,#2 + .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + vld1.32 {q1},[r2]! + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + bgt .Loop_dec + + .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 + vld1.32 {q0},[r2] + .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_cbc_encrypt +.type aes_v8_cbc_encrypt,%function +.align 5 +aes_v8_cbc_encrypt: + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo .Lcbc_abort + moveq r8,#0 + + cmp r5,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q6},[r4] + vld1.8 {q0},[r0],r8 + + vld1.32 {q8-q9},[r3] @ load key schedule... 
+ sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10-q11},[r7]! + vld1.32 {q12-q13},[r7]! + vld1.32 {q14-q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq .Lcbc_dec + + cmp r5,#2 + veor q0,q0,q6 + veor q5,q8,q7 + beq .Lcbc_enc128 + +.Loop_cbc_enc: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + vld1.32 {q9},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + bgt .Loop_cbc_enc + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r7,r3,#16 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + + mov r6,r5 + veor q6,q0,q7 + vst1.8 {q6},[r1]! + bhs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {q2-q3},[r7] + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! 
+.Lenter_cbc_enc128: + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 + .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 + .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs .Loop_cbc_enc128 + + vst1.8 {q6},[r1]! + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + vld1.32 {q4-q5},[r7] + veor q6,q6,q7 + veor q2,q0,q7 + mov r12,r8 + +.Loop2x_cbc_dec128: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + subs r2,r2,#32 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movlo r8,#0 + .byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4 + .byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movls r12,#0 + .byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5 + .byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ 
aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 + + veor q6,q6,q0 + vld1.8 {q0},[r0],r8 + veor q2,q2,q1 + vld1.8 {q1},[r0],r12 + vst1.8 {q6},[r1]! + veor q6,q3,q7 + vst1.8 {q2},[r1]! + veor q2,q0,q7 + vorr q3,q1,q1 + bhs .Loop2x_cbc_dec128 + + adds r2,r2,#32 + veor q6,q6,q7 + beq .Lcbc_done + veor q2,q2,q7 + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs r2,r2,#16 + vorr q2,q0,q0 + blo .Lcbc_dec_tail + + moveq r8,#0 + cmp r5,#2 + vld1.8 {q1},[r0],r8 + vorr q3,q1,q1 + beq .Lcbc_dec128 + +.Loop2x_cbc_dec: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + vld1.32 {q8},[r7]! + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + subs r6,r6,#2 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + vld1.32 {q9},[r7]! 
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + bgt .Loop2x_cbc_dec + + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + veor q4,q6,q7 + veor q5,q2,q7 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vorr q6,q3,q3 + subs r2,r2,#32 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + movlo r8,#0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + mov r7,r3 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vld1.8 {q2},[r0],r8 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + movls r8,#0 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.8 {q3},[r0],r8 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 + + mov r6,r5 + veor q4,q4,q0 + veor q5,q5,q1 + vorr q0,q2,q2 + vst1.8 {q4},[r1]! + vorr q1,q3,q3 + vst1.8 {q5},[r1]! + bhs .Loop2x_cbc_dec + + adds r2,r2,#32 + beq .Lcbc_done + +.Lcbc_dec_tail: + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + vld1.32 {q8},[r7]! + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + subs r6,r6,#2 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + vld1.32 {q9},[r7]! 
+ .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + bgt .Lcbc_dec_tail + + .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + veor q4,q6,q7 + .byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vorr q6,q2,q2 + .byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 + .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 + + veor q4,q4,q0 + vst1.8 {q4},[r1]! + +.Lcbc_done: + vst1.8 {q6},[r4] +.Lcbc_abort: + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r8,pc} +.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks +.type aes_v8_ctr32_encrypt_blocks,%function +.align 5 +aes_v8_ctr32_encrypt_blocks: + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg + ldr r5,[r3,#240] + + ldr r8, [r4, #12] + vld1.32 {q0},[r4] + + vld1.32 {q8-q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10-q11},[r7]! + vld1.32 {q12-q13},[r7]! + vld1.32 {q14-q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + + subs r2,r2,#2 + blo .Lctr32_tail + +#ifndef __ARMEB__ + rev r8, r8 +#endif + vorr q1,q0,q0 + add r8, r8, #1 + vorr q6,q0,q0 + rev r10, r8 + cmp r5,#2 + vmov.32 d3[1],r10 + beq .Lctr32_128 + +.Loop2x_ctr32: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 + vld1.32 {q9},[r7]! 
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + bgt .Loop2x_ctr32 + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 + vorr q0,q6,q6 + .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + vorr q1,q6,q6 + .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 + .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 + vld1.8 {q2},[r0]! + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + vld1.8 {q3},[r0]! + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + add r8,r8,#1 + .byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10 + .byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10 + rev r9,r8 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + add r8,r8,#1 + .byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11 + .byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11 + veor q2,q2,q7 + rev r10,r8 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q3,q3,q7 + mov r7,r3 + .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 + .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 + subs r2,r2,#2 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1] + .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 + .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 + .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 + vmov.32 d1[1], r9 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + vmov.32 d3[1], r10 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 + .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 + + mov r6,r5 + veor q2,q2,q4 + veor q3,q3,q5 + vst1.8 {q2},[r1]! + vst1.8 {q3},[r1]! 
+ bhs .Loop2x_ctr32 + + adds r2,r2,#2 + beq .Lctr32_done + b .Lctr32_tail + +.Lctr32_128: + vld1.32 {q4-q5},[r7] + +.Loop2x_ctr32_128: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q2},[r0]! + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0]! + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 + add r8,r8,#1 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + rev r9,r8 + .byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4 + .byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4 + add r8,r8,#1 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + rev r10,r8 + .byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5 + .byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5 + subs r2,r2,#2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q2,q2,q7 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q3,q3,q7 + .byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + + veor q2,q2,q0 + vorr q0,q6,q6 + veor q3,q3,q1 + vorr q1,q6,q6 + vst1.8 {q2},[r1]! + vmov.32 d1[1], r9 + vst1.8 {q3},[r1]! 
+ vmov.32 d3[1], r10 + bhs .Loop2x_ctr32_128 + + adds r2,r2,#2 + beq .Lctr32_done + +.Lctr32_tail: + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + vld1.32 {q8},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r6,r6,#2 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + vld1.32 {q9},[r7]! + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + bgt .Lctr32_tail + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q2},[r0] + .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q2,q2,q7 + .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + + veor q2,q2,q0 + vst1.8 {q2},[r1] + +.Lctr32_done: + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/app/openssl/crypto/aes/asm/aesv8-armx.pl b/app/openssl/crypto/aes/asm/aesv8-armx.pl new file mode 100644 index 00000000..415dc04a --- /dev/null +++ b/app/openssl/crypto/aes/asm/aesv8-armx.pl @@ -0,0 +1,980 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. 
As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional instructions. This has +# no effect on mighty Apple A7, as results are literally equal to +# the theoretical estimates based on instruction latencies and issue +# rate. It remains to be seen how does it affect other platforms... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A5x n/a n/a n/a + +$flavour = shift; +open STDOUT,">".shift; + +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex voodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +$code.=<<___; +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___; + adr $ptr,rcon + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {$in1},[$inp],#8 + vmov.i8 $key,#8 // borrow $key + vst1.32 {$in0},[$out],#16 + vsub.i8 $mask,$mask,$key // adjust the mask + +.Loop192: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#8 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + + vdup.32 $tmp,${in0}[3] + veor $tmp,$tmp,$in1 + veor $key,$key,$rcon + vext.8 $in1,$zero,$in1,#12 + vshl.u8 $rcon,$rcon,#1 + veor $in1,$in1,$tmp + veor $in0,$in0,$key + veor $in1,$in1,$key + vst1.32 {$in0},[$out],#16 + b.ne .Loop192 + + mov $rounds,#12 + add $out,$out,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 
{$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + + eor x0,x0,x0 // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key + +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + stmdb sp!,{r4,lr} +___ +$code.=<<___; + bl .Lenc_key + + sub $out,$out,#240 // restore original $out + mov x4,#-16 + add $inp,$out,x12,lsl#4 // end of key schedule + + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + +.Loop_imc: + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + cmp $inp,$out + b.hi .Loop_imc + + vld1.32 {v0.16b},[$out] + aesimc v0.16b,v0.16b + vst1.32 {v0.16b},[$inp] + + eor x0,x0,x0 // return value +___ +$code.=<<___ if ($flavour !~ /64/); + ldmia sp!,{r4,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldp x29,x30,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} +{{{ +sub gen_block () { +my $dir = shift; +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); +my ($inp,$out,$key)=map("x$_",(0..2)); +my $rounds="w3"; +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ldr $rounds,[$key,#240] + vld1.32 {$rndkey0},[$key],#16 + vld1.8 {$inout},[$inp] + sub $rounds,$rounds,#2 + vld1.32 {$rndkey1},[$key],#16 + +.Loop_${dir}c: + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key],#16 + aes$mc $inout,$inout + subs $rounds,$rounds,#2 + aes$e $inout,$rndkey1 + vld1.32 {$rndkey1},[$key],#16 + aes$mc $inout,$inout + b.gt .Loop_${dir}c + + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key] + aes$mc $inout,$inout + aes$e $inout,$rndkey1 + veor $inout,$inout,$rndkey0 + + vst1.8 {$inout},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my 
($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args +___ +$code.=<<___; + subs $len,$len,#16 + mov $step,#16 + b.lo .Lcbc_abort + cclr $step,eq + + cmp $enc,#0 // en- or decrypting? + ldr $rounds,[$key,#240] + and $len,$len,#-16 + vld1.8 {$ivec},[$ivp] + vld1.8 {$dat},[$inp],$step + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#6 + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys + sub $rounds,$rounds,#2 + vld1.32 {q10-q11},[$key_],#32 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + + add $key_,$key,#32 + mov $cnt,$rounds + b.eq .Lcbc_dec + + cmp $rounds,#2 + veor $dat,$dat,$ivec + veor $rndzero_n_last,q8,$rndlast + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese $dat,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat,$dat + subs $cnt,$cnt,#2 + aese $dat,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat,$dat + b.gt .Loop_cbc_enc + + aese $dat,q8 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,q9 + aesmc $dat,$dat + cclr $step,eq + aese $dat,q10 + aesmc $dat,$dat + add $key_,$key,#16 + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q13 + aesmc $dat,$dat + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aese $dat,q14 + aesmc $dat,$dat + aese $dat,q15 + + mov $cnt,$rounds + veor $ivec,$dat,$rndlast + vst1.8 {$ivec},[$out],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {$in0-$in1},[$key_] + aese $dat,q8 + aesmc 
$dat,$dat + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese $dat,q8 + aesmc $dat,$dat + vst1.8 {$ivec},[$out],#16 +.Lenter_cbc_enc128: + aese $dat,q9 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,$in0 + aesmc $dat,$dat + cclr $step,eq + aese $dat,$in1 + aesmc $dat,$dat + aese $dat,q10 + aesmc $dat,$dat + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + aese $dat,q13 + aesmc $dat,$dat + aese $dat,q14 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q15 + veor $ivec,$dat,$rndlast + b.hs .Loop_cbc_enc128 + + vst1.8 {$ivec},[$out],#16 + b .Lcbc_done + +.align 5 +.Lcbc_dec128: + vld1.32 {$tmp0-$tmp1},[$key_] + veor $ivec,$ivec,$rndlast + veor $in0,$dat0,$rndlast + mov $step1,$step + +.Loop2x_cbc_dec128: + aesd $dat0,q8 + aesd $dat1,q8 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + subs $len,$len,#32 + aesd $dat0,q9 + aesd $dat1,q9 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + cclr $step,lo + aesd $dat0,$tmp0 + aesd $dat1,$tmp0 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + cclr $step1,ls + aesd $dat0,$tmp1 + aesd $dat1,$tmp1 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q10 + aesd $dat1,q10 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q11 + aesd $dat1,q11 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q12 + aesd $dat1,q12 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q13 + aesd $dat1,q13 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q14 + aesd $dat1,q14 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesd $dat0,q15 + aesd $dat1,q15 + + veor $ivec,$ivec,$dat0 + vld1.8 {$dat0},[$inp],$step + veor $in0,$in0,$dat1 + vld1.8 {$dat1},[$inp],$step1 + vst1.8 {$ivec},[$out],#16 + veor $ivec,$in1,$rndlast + vst1.8 {$in0},[$out],#16 + veor $in0,$dat0,$rndlast + vorr $in1,$dat1,$dat1 + b.hs .Loop2x_cbc_dec128 + + adds $len,$len,#32 + veor $ivec,$ivec,$rndlast + b.eq .Lcbc_done + veor $in0,$in0,$rndlast + b .Lcbc_dec_tail + +.align 5 +.Lcbc_dec: + subs $len,$len,#16 + vorr $in0,$dat,$dat + 
b.lo .Lcbc_dec_tail + + cclr $step,eq + cmp $rounds,#2 + vld1.8 {$dat1},[$inp],$step + vorr $in1,$dat1,$dat1 + b.eq .Lcbc_dec128 + +.Loop2x_cbc_dec: + aesd $dat0,q8 + aesd $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + subs $cnt,$cnt,#2 + aesd $dat0,q9 + aesd $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + b.gt .Loop2x_cbc_dec + + aesd $dat0,q8 + aesd $dat1,q8 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + veor $tmp0,$ivec,$rndlast + veor $tmp1,$in0,$rndlast + aesd $dat0,q9 + aesd $dat1,q9 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vorr $ivec,$in1,$in1 + subs $len,$len,#32 + aesd $dat0,q10 + aesd $dat1,q10 + aesimc $dat0,$dat0 + cclr $step,lo + aesimc $dat1,$dat1 + mov $key_,$key + aesd $dat0,q11 + aesd $dat1,q11 + aesimc $dat0,$dat0 + vld1.8 {$in0},[$inp],$step + aesimc $dat1,$dat1 + cclr $step,ls + aesd $dat0,q12 + aesd $dat1,q12 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.8 {$in1},[$inp],$step + aesd $dat0,q13 + aesd $dat1,q13 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesd $dat0,q14 + aesd $dat1,q14 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aesd $dat0,q15 + aesd $dat1,q15 + + mov $cnt,$rounds + veor $tmp0,$tmp0,$dat0 + veor $tmp1,$tmp1,$dat1 + vorr $dat0,$in0,$in0 + vst1.8 {$tmp0},[$out],#16 + vorr $dat1,$in1,$in1 + vst1.8 {$tmp1},[$out],#16 + b.hs .Loop2x_cbc_dec + + adds $len,$len,#32 + b.eq .Lcbc_done + +.Lcbc_dec_tail: + aesd $dat,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat,$dat + subs $cnt,$cnt,#2 + aesd $dat,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat,$dat + b.gt .Lcbc_dec_tail + + aesd $dat,q8 + aesimc $dat,$dat + aesd $dat,q9 + aesimc $dat,$dat + veor $tmp,$ivec,$rndlast + aesd $dat,q10 + aesimc $dat,$dat + vorr $ivec,$in0,$in0 + aesd $dat,q11 + aesimc $dat,$dat + aesd $dat,q12 + aesimc $dat,$dat + aesd $dat,q13 + aesimc $dat,$dat + aesd $dat,q14 + aesimc $dat,$dat + aesd $dat,q15 
+ + veor $tmp,$tmp,$dat + vst1.8 {$tmp},[$out],#16 + +.Lcbc_done: + vst1.8 {$ivec},[$ivp] +.Lcbc_abort: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r8,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... 
+ sub $rounds,$rounds,#6 + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys + sub $rounds,$rounds,#2 + vld1.32 {q10-q11},[$key_],#32 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + + add $key_,$key,#32 + mov $cnt,$rounds + + subs $len,$len,#2 + b.lo .Lctr32_tail + +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + vorr $dat1,$dat0,$dat0 + add $ctr, $ctr, #1 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $ctr + cmp $rounds,#2 + vmov.32 ${dat1}[3],$tctr1 + b.eq .Lctr32_128 + +.Loop2x_ctr32: + aese $dat0,q8 + aese $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + b.gt .Loop2x_ctr32 + + aese $dat0,q8 + aese $dat1,q8 + aesmc $tmp0,$dat0 + vorr $dat0,$ivec,$ivec + aesmc $tmp1,$dat1 + vorr $dat1,$ivec,$ivec + aese $tmp0,q9 + aese $tmp1,q9 + vld1.8 {$in0},[$inp],#16 + aesmc $tmp0,$tmp0 + vld1.8 {$in1},[$inp],#16 + aesmc $tmp1,$tmp1 + add $ctr,$ctr,#1 + aese $tmp0,q10 + aese $tmp1,q10 + rev $tctr,$ctr + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + add $ctr,$ctr,#1 + aese $tmp0,q11 + aese $tmp1,q11 + veor $in0,$in0,$rndlast + rev $tctr1,$ctr + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + veor $in1,$in1,$rndlast + mov $key_,$key + aese $tmp0,q12 + aese $tmp1,q12 + subs $len,$len,#2 + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1] + aese $tmp0,q13 + aese $tmp1,q13 + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aese $tmp0,q14 + aese $tmp1,q14 + vmov.32 ${dat0}[3], $tctr + aesmc $tmp0,$tmp0 + vmov.32 ${dat1}[3], $tctr1 + aesmc $tmp1,$tmp1 + aese $tmp0,q15 + aese $tmp1,q15 + + mov $cnt,$rounds + veor $in0,$in0,$tmp0 + veor $in1,$in1,$tmp1 + vst1.8 {$in0},[$out],#16 + vst1.8 {$in1},[$out],#16 + b.hs .Loop2x_ctr32 + + adds $len,$len,#2 + b.eq .Lctr32_done + b .Lctr32_tail + +.Lctr32_128: + vld1.32 {$tmp0-$tmp1},[$key_] + +.Loop2x_ctr32_128: + aese $dat0,q8 + aese 
$dat1,q8 + aesmc $dat0,$dat0 + vld1.8 {$in0},[$inp],#16 + aesmc $dat1,$dat1 + vld1.8 {$in1},[$inp],#16 + aese $dat0,q9 + aese $dat1,q9 + add $ctr,$ctr,#1 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + rev $tctr,$ctr + aese $dat0,$tmp0 + aese $dat1,$tmp0 + add $ctr,$ctr,#1 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + rev $tctr1,$ctr + aese $dat0,$tmp1 + aese $dat1,$tmp1 + subs $len,$len,#2 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q10 + aese $dat1,q10 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q11 + aese $dat1,q11 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q12 + aese $dat1,q12 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q13 + aese $dat1,q13 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q14 + aese $dat1,q14 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + veor $in0,$in0,$rndlast + aese $dat0,q15 + veor $in1,$in1,$rndlast + aese $dat1,q15 + + veor $in0,$in0,$dat0 + vorr $dat0,$ivec,$ivec + veor $in1,$in1,$dat1 + vorr $dat1,$ivec,$ivec + vst1.8 {$in0},[$out],#16 + vmov.32 ${dat0}[3], $tctr + vst1.8 {$in1},[$out],#16 + vmov.32 ${dat1}[3], $tctr1 + b.hs .Loop2x_ctr32_128 + + adds $len,$len,#2 + b.eq .Lctr32_done + +.Lctr32_tail: + aese $dat,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat,$dat + subs $cnt,$cnt,#2 + aese $dat,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat,$dat + b.gt .Lctr32_tail + + aese $dat,q8 + aesmc $dat,$dat + aese $dat,q9 + aesmc $dat,$dat + vld1.8 {$in0},[$inp] + aese $dat,q10 + aesmc $dat,$dat + aese $dat,q11 + aesmc $dat,$dat + aese $dat,q12 + aesmc $dat,$dat + aese $dat,q13 + aesmc $dat,$dat + aese $dat,q14 + aesmc $dat,$dat + veor $in0,$in0,$rndlast + aese $dat,q15 + + veor $in0,$in0,$dat + vst1.8 {$in0},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ 
+######################################## +if ($flavour =~ /64/) { ######## 64-bit code + my %opcode = ( + "aesd" => 0x4e285800, "aese" => 0x4e284800, + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5), + $mnemonic,$arg; + }; + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + # fix up remaining legacy suffixes + s/\.[ui]?8//o; + m/\],#8/o and s/\.16b/\.8b/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + my %opcode = ( + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + }; + + sub unvtbl { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + } + + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + } + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or + s/\],#[0-9]+/]!/o; + + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; diff --git a/app/openssl/crypto/arm64cpuid.S b/app/openssl/crypto/arm64cpuid.S new file mode 100644 index 00000000..4778ac1d --- /dev/null +++ b/app/openssl/crypto/arm64cpuid.S @@ -0,0 +1,46 @@ +#include "arm_arch.h" + +.text +.arch armv8-a+crypto + +.align 5 +.global _armv7_neon_probe +.type _armv7_neon_probe,%function +_armv7_neon_probe: + orr v15.16b, v15.16b, v15.16b + ret +.size _armv7_neon_probe,.-_armv7_neon_probe + +.global _armv7_tick +.type _armv7_tick,%function +_armv7_tick: + mrs x0, CNTVCT_EL0 + ret +.size _armv7_tick,.-_armv7_tick + +.global _armv8_aes_probe +.type _armv8_aes_probe,%function +_armv8_aes_probe: + aese v0.16b, v0.16b + ret +.size _armv8_aes_probe,.-_armv8_aes_probe + +.global _armv8_sha1_probe +.type _armv8_sha1_probe,%function +_armv8_sha1_probe: + sha1h s0, s0 + ret +.size _armv8_sha1_probe,.-_armv8_sha1_probe + +.global _armv8_sha256_probe +.type _armv8_sha256_probe,%function +_armv8_sha256_probe: + sha256su0 v0.4s, v0.4s + ret +.size 
_armv8_sha256_probe,.-_armv8_sha256_probe +.global _armv8_pmull_probe +.type _armv8_pmull_probe,%function +_armv8_pmull_probe: + pmull v0.1q, v0.1d, v0.1d + ret +.size _armv8_pmull_probe,.-_armv8_pmull_probe diff --git a/app/openssl/crypto/arm_arch.h b/app/openssl/crypto/arm_arch.h index 5a831076..6fa87244 100644 --- a/app/openssl/crypto/arm_arch.h +++ b/app/openssl/crypto/arm_arch.h @@ -10,13 +10,24 @@ # define __ARMEL__ # endif # elif defined(__GNUC__) +# if defined(__aarch64__) +# define __ARM_ARCH__ 8 +# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ +# define __ARMEB__ +# else +# define __ARMEL__ +# endif /* * Why doesn't gcc define __ARM_ARCH__? Instead it defines * bunch of below macros. See all_architectires[] table in * gcc/config/arm/arm.c. On a side note it defines * __ARMEL__/__ARMEB__ for little-/big-endian. */ -# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# elif defined(__ARM_ARCH_8A__) +# define __ARM_ARCH__ 8 +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ defined(__ARM_ARCH_7EM__) # define __ARM_ARCH__ 7 @@ -43,9 +54,13 @@ #if !__ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; +#endif #define ARMV7_NEON (1<<0) #define ARMV7_TICK (1<<1) -#endif +#define ARMV8_AES (1<<2) +#define ARMV8_SHA1 (1<<3) +#define ARMV8_SHA256 (1<<4) +#define ARMV8_PMULL (1<<5) #endif diff --git a/app/openssl/crypto/armcap.c b/app/openssl/crypto/armcap.c index 9abaf396..7e46d07a 100644 --- a/app/openssl/crypto/armcap.c +++ b/app/openssl/crypto/armcap.c @@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } * ARM compilers support inline assembler... 
*/ void _armv7_neon_probe(void); -unsigned int _armv7_tick(void); +void _armv8_aes_probe(void); +void _armv8_sha1_probe(void); +void _armv8_sha256_probe(void); +void _armv8_pmull_probe(void); +unsigned long _armv7_tick(void); -unsigned int OPENSSL_rdtsc(void) +unsigned long OPENSSL_rdtsc(void) { if (OPENSSL_armcap_P & ARMV7_TICK) return _armv7_tick(); @@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void) return 0; } +/* + * Use a weak reference to getauxval() so we can use it if it is available but + * don't break the build if it is not. + */ #if defined(__GNUC__) && __GNUC__>=2 void OPENSSL_cpuid_setup(void) __attribute__((constructor)); +extern unsigned long getauxval(unsigned long type) __attribute__((weak)); +#else +static unsigned long (*getauxval)(unsigned long) = NULL; #endif + +/* + * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas + * AArch64 uses AT_HWCAP. + */ +#if defined(__arm__) || defined (__arm) +# define HWCAP 16 /* AT_HWCAP */ +# define HWCAP_NEON (1 << 12) + +# define HWCAP_CE 26 /* AT_HWCAP2 */ +# define HWCAP_CE_AES (1 << 0) +# define HWCAP_CE_PMULL (1 << 1) +# define HWCAP_CE_SHA1 (1 << 2) +# define HWCAP_CE_SHA256 (1 << 3) +#elif defined(__aarch64__) +# define HWCAP 16 /* AT_HWCAP */ +# define HWCAP_NEON (1 << 1) + +# define HWCAP_CE HWCAP +# define HWCAP_CE_AES (1 << 3) +# define HWCAP_CE_PMULL (1 << 4) +# define HWCAP_CE_SHA1 (1 << 5) +# define HWCAP_CE_SHA256 (1 << 6) +#endif + void OPENSSL_cpuid_setup(void) { char *e; @@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void) if ((e=getenv("OPENSSL_armcap"))) { - OPENSSL_armcap_P=strtoul(e,NULL,0); + OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0); return; } @@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void) sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); sigaction(SIGILL,&ill_act,&ill_oact); - if (sigsetjmp(ill_jmp,1) == 0) + if (getauxval != NULL) + { + if (getauxval(HWCAP) & HWCAP_NEON) + { + unsigned long hwcap = getauxval(HWCAP_CE); + + OPENSSL_armcap_P |= 
ARMV7_NEON; + + if (hwcap & HWCAP_CE_AES) + OPENSSL_armcap_P |= ARMV8_AES; + + if (hwcap & HWCAP_CE_PMULL) + OPENSSL_armcap_P |= ARMV8_PMULL; + + if (hwcap & HWCAP_CE_SHA1) + OPENSSL_armcap_P |= ARMV8_SHA1; + + if (hwcap & HWCAP_CE_SHA256) + OPENSSL_armcap_P |= ARMV8_SHA256; + } + } + else if (sigsetjmp(ill_jmp,1) == 0) { _armv7_neon_probe(); OPENSSL_armcap_P |= ARMV7_NEON; + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_pmull_probe(); + OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES; + } + else if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_aes_probe(); + OPENSSL_armcap_P |= ARMV8_AES; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_sha1_probe(); + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_sha256_probe(); + OPENSSL_armcap_P |= ARMV8_SHA256; + } } if (sigsetjmp(ill_jmp,1) == 0) { diff --git a/app/openssl/crypto/armv4cpuid.S b/app/openssl/crypto/armv4cpuid.S index 2d618dea..add11d40 100644 --- a/app/openssl/crypto/armv4cpuid.S +++ b/app/openssl/crypto/armv4cpuid.S @@ -7,17 +7,49 @@ .global _armv7_neon_probe .type _armv7_neon_probe,%function _armv7_neon_probe: - .word 0xf26ee1fe @ vorr q15,q15,q15 - .word 0xe12fff1e @ bx lr + .byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8 + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr .size _armv7_neon_probe,.-_armv7_neon_probe .global _armv7_tick .type _armv7_tick,%function _armv7_tick: - mrc p15,0,r0,c9,c13,0 - .word 0xe12fff1e @ bx lr + mrrc p15,1,r0,r1,c14 @ CNTVCT +#if __ARM_ARCH__>=5 + bx lr +#else + .word 0xe12fff1e @ bx lr +#endif .size _armv7_tick,.-_armv7_tick +.global _armv8_aes_probe +.type _armv8_aes_probe,%function +_armv8_aes_probe: + .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr +.size _armv8_aes_probe,.-_armv8_aes_probe + +.global _armv8_sha1_probe +.type _armv8_sha1_probe,%function +_armv8_sha1_probe: + .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr +.size _armv8_sha1_probe,.-_armv8_sha1_probe + +.global _armv8_sha256_probe +.type 
_armv8_sha256_probe,%function +_armv8_sha256_probe: + .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr +.size _armv8_sha256_probe,.-_armv8_sha256_probe +.global _armv8_pmull_probe +.type _armv8_pmull_probe,%function +_armv8_pmull_probe: + .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr +.size _armv8_pmull_probe,.-_armv8_pmull_probe + +.align 5 .global OPENSSL_atomic_add .type OPENSSL_atomic_add,%function OPENSSL_atomic_add: @@ -28,7 +60,7 @@ OPENSSL_atomic_add: cmp r2,#0 bne .Ladd mov r0,r3 - .word 0xe12fff1e @ bx lr + bx lr #else stmdb sp!,{r4-r6,lr} ldr r2,.Lspinlock @@ -81,9 +113,13 @@ OPENSSL_cleanse: adds r1,r1,#4 bne .Little .Lcleanse_done: +#if __ARM_ARCH__>=5 + bx lr +#else tst lr,#1 moveq pc,lr .word 0xe12fff1e @ bx lr +#endif .size OPENSSL_cleanse,.-OPENSSL_cleanse .global OPENSSL_wipe_cpu @@ -97,41 +133,53 @@ OPENSSL_wipe_cpu: eor ip,ip,ip tst r0,#1 beq .Lwipe_done - .word 0xf3000150 @ veor q0, q0, q0 - .word 0xf3022152 @ veor q1, q1, q1 - .word 0xf3044154 @ veor q2, q2, q2 - .word 0xf3066156 @ veor q3, q3, q3 - .word 0xf34001f0 @ veor q8, q8, q8 - .word 0xf34221f2 @ veor q9, q9, q9 - .word 0xf34441f4 @ veor q10, q10, q10 - .word 0xf34661f6 @ veor q11, q11, q11 - .word 0xf34881f8 @ veor q12, q12, q12 - .word 0xf34aa1fa @ veor q13, q13, q13 - .word 0xf34cc1fc @ veor q14, q14, q14 - .word 0xf34ee1fe @ veor q15, q15, q15 + .byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0 + .byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1 + .byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2 + .byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3 + .byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8 + .byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9 + .byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10 + .byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11 + .byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12 + .byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13 + .byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14 + .byte 0xfe,0xe1,0x4e,0xf3 @ veor q15, q15, q15 
.Lwipe_done: mov r0,sp +#if __ARM_ARCH__>=5 + bx lr +#else tst lr,#1 moveq pc,lr .word 0xe12fff1e @ bx lr +#endif .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu .global OPENSSL_instrument_bus .type OPENSSL_instrument_bus,%function OPENSSL_instrument_bus: eor r0,r0,r0 +#if __ARM_ARCH__>=5 + bx lr +#else tst lr,#1 moveq pc,lr .word 0xe12fff1e @ bx lr +#endif .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus .global OPENSSL_instrument_bus2 .type OPENSSL_instrument_bus2,%function OPENSSL_instrument_bus2: eor r0,r0,r0 +#if __ARM_ARCH__>=5 + bx lr +#else tst lr,#1 moveq pc,lr .word 0xe12fff1e @ bx lr +#endif .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 .align 5 diff --git a/app/openssl/crypto/asn1/a_strnid.c b/app/openssl/crypto/asn1/a_strnid.c index 2fc48c15..2afd5a41 100644 --- a/app/openssl/crypto/asn1/a_strnid.c +++ b/app/openssl/crypto/asn1/a_strnid.c @@ -74,7 +74,7 @@ static int sk_table_cmp(const ASN1_STRING_TABLE * const *a, * certain software (e.g. Netscape) has problems with them. */ -static unsigned long global_mask = 0xFFFFFFFFL; +static unsigned long global_mask = B_ASN1_UTF8STRING; void ASN1_STRING_set_default_mask(unsigned long mask) { diff --git a/app/openssl/crypto/bio/bio.h b/app/openssl/crypto/bio/bio.h index 05699ab2..d05fa22a 100644 --- a/app/openssl/crypto/bio/bio.h +++ b/app/openssl/crypto/bio/bio.h @@ -266,6 +266,9 @@ void BIO_clear_flags(BIO *b, int flags); #define BIO_RR_CONNECT 0x02 /* Returned from the accept BIO when an accept would have blocked */ #define BIO_RR_ACCEPT 0x03 +/* Returned from the SSL bio when the channel id retrieval code cannot find the + * private key. 
*/ +#define BIO_RR_SSL_CHANNEL_ID_LOOKUP 0x04 /* These are passed by the BIO callback */ #define BIO_CB_FREE 0x01 diff --git a/app/openssl/crypto/bio/bss_dgram.c b/app/openssl/crypto/bio/bss_dgram.c index 54c012c4..d9967e72 100644 --- a/app/openssl/crypto/bio/bss_dgram.c +++ b/app/openssl/crypto/bio/bss_dgram.c @@ -1333,7 +1333,7 @@ static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr) bio_dgram_sctp_data *data = NULL; socklen_t sockopt_len = 0; struct sctp_authkeyid authkeyid; - struct sctp_authkey *authkey; + struct sctp_authkey *authkey = NULL; data = (bio_dgram_sctp_data *)b->ptr; @@ -1388,6 +1388,11 @@ static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr) /* Add new key */ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t); authkey = OPENSSL_malloc(sockopt_len); + if (authkey == NULL) + { + ret = -1; + break; + } memset(authkey, 0x00, sockopt_len); authkey->sca_keynumber = authkeyid.scact_keynumber + 1; #ifndef __FreeBSD__ @@ -1399,6 +1404,8 @@ static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr) memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t)); ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len); + OPENSSL_free(authkey); + authkey = NULL; if (ret < 0) break; /* Reset active key */ diff --git a/app/openssl/crypto/bn/asm/armv4-gf2m.S b/app/openssl/crypto/bn/asm/armv4-gf2m.S index 038f0864..0fa25b26 100644 --- a/app/openssl/crypto/bn/asm/armv4-gf2m.S +++ b/app/openssl/crypto/bn/asm/armv4-gf2m.S @@ -5,31 +5,6 @@ #if __ARM_ARCH__>=7 .fpu neon - -.type mul_1x1_neon,%function -.align 5 -mul_1x1_neon: - vshl.u64 d2,d16,#8 @ q1-q3 are slided - vmull.p8 q0,d16,d17 @ a·bb - vshl.u64 d4,d16,#16 - vmull.p8 q1,d2,d17 @ a<<8·bb - vshl.u64 d6,d16,#24 - vmull.p8 q2,d4,d17 @ a<<16·bb - vshr.u64 d2,#8 - vmull.p8 q3,d6,d17 @ a<<24·bb - vshl.u64 d3,#24 - veor d0,d2 - vshr.u64 d4,#16 - veor d0,d3 - vshl.u64 d5,#16 - veor d0,d4 - vshr.u64 d6,#24 - veor d0,d5 - vshl.u64 d7,#8 - veor d0,d6 - veor d0,d7 
- .word 0xe12fff1e -.size mul_1x1_neon,.-mul_1x1_neon #endif .type mul_1x1_ialu,%function .align 5 @@ -120,40 +95,53 @@ bn_GF2m_mul_2x2: tst r12,#1 beq .Lialu - veor d18,d18 - vmov.32 d19,r3,r3 @ two copies of b1 - vmov.32 d18[0],r1 @ a1 - - veor d20,d20 - vld1.32 d21[],[sp,:32] @ two copies of b0 - vmov.32 d20[0],r2 @ a0 - mov r12,lr - - vmov d16,d18 - vmov d17,d19 - bl mul_1x1_neon @ a1·b1 - vmov d22,d0 - - vmov d16,d20 - vmov d17,d21 - bl mul_1x1_neon @ a0·b0 - vmov d23,d0 - - veor d16,d20,d18 - veor d17,d21,d19 - veor d20,d23,d22 - bl mul_1x1_neon @ (a0+a1)·(b0+b1) - - veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 - vshl.u64 d1,d0,#32 - vshr.u64 d0,d0,#32 - veor d23,d1 - veor d22,d0 - vst1.32 {d23[0]},[r0,:32]! - vst1.32 {d23[1]},[r0,:32]! - vst1.32 {d22[0]},[r0,:32]! - vst1.32 {d22[1]},[r0,:32] - bx r12 + ldr r12, [sp] @ 5th argument + vmov.32 d26, r2, r1 + vmov.32 d27, r12, r3 + vmov.i64 d28, #0x0000ffffffffffff + vmov.i64 d29, #0x00000000ffffffff + vmov.i64 d30, #0x000000000000ffff + + vext.8 d2, d26, d26, #1 @ A1 + vmull.p8 q1, d2, d27 @ F = A1*B + vext.8 d0, d27, d27, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d4, d26, d26, #2 @ A2 + vmull.p8 q2, d4, d27 @ H = A2*B + vext.8 d16, d27, d27, #2 @ B2 + vmull.p8 q8, d26, d16 @ G = A*B2 + vext.8 d6, d26, d26, #3 @ A3 + veor q1, q1, q0 @ L = E + F + vmull.p8 q3, d6, d27 @ J = A3*B + vext.8 d0, d27, d27, #3 @ B3 + veor q2, q2, q8 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8 + vand d3, d3, d28 + vext.8 d16, d27, d27, #4 @ B4 + veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16 + vand d5, d5, d29 + vmull.p8 q8, d26, d16 @ K = A*B4 + veor q3, q3, q0 @ N = I + J + veor d2, d2, d3 + veor d4, d4, d5 + veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24 + vand d7, d7, d30 + vext.8 q1, q1, q1, #15 + veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d17, #0 + vext.8 q2, q2, q2, #14 + veor d6, d6, d7 + vmull.p8 q0, d26, d27 @ D = A*B + vext.8 q8, q8, q8, #12 + vext.8 q3, q3, q3, #13 + 
veor q1, q1, q2 + veor q3, q3, q8 + veor q0, q0, q1 + veor q0, q0, q3 + + vst1.32 {q0}, [r0] + bx lr @ bx lr .align 4 .Lialu: #endif diff --git a/app/openssl/crypto/bn/asm/armv4-gf2m.pl b/app/openssl/crypto/bn/asm/armv4-gf2m.pl index 22ad1f85..3f1f4f67 100644 --- a/app/openssl/crypto/bn/asm/armv4-gf2m.pl +++ b/app/openssl/crypto/bn/asm/armv4-gf2m.pl @@ -20,14 +20,21 @@ # length, more for longer keys. Even though NEON 1x1 multiplication # runs in even less cycles, ~30, improvement is measurable only on # longer keys. One has to optimize code elsewhere to get NEON glow... +# +# April 2014 +# +# Double bn_GF2m_mul_2x2 performance by using algorithm from paper +# referred below, which improves ECDH and ECDSA verify benchmarks +# by 18-40%. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } - $code=<<___; #include "arm_arch.h" @@ -36,31 +43,6 @@ $code=<<___; #if __ARM_ARCH__>=7 .fpu neon - -.type mul_1x1_neon,%function -.align 5 -mul_1x1_neon: - vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a - vmull.p8 `&Q("d0")`,d16,d17 @ a·bb - vshl.u64 `&Dlo("q2")`,d16,#16 - vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb - vshl.u64 `&Dlo("q3")`,d16,#24 - vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb - vshr.u64 `&Dlo("q1")`,#8 - vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb - vshl.u64 `&Dhi("q1")`,#24 - veor d0,`&Dlo("q1")` - vshr.u64 `&Dlo("q2")`,#16 - veor d0,`&Dhi("q1")` - vshl.u64 `&Dhi("q2")`,#16 - veor d0,`&Dlo("q2")` - vshr.u64 `&Dlo("q3")`,#24 - veor d0,`&Dhi("q2")` - vshl.u64 `&Dhi("q3")`,#8 - veor d0,`&Dlo("q3")` - veor d0,`&Dhi("q3")` - bx lr -.size mul_1x1_neon,.-mul_1x1_neon #endif ___ 
################ @@ -159,8 +141,9 @@ ___ # void bn_GF2m_mul_2x2(BN_ULONG *r, # BN_ULONG a1,BN_ULONG a0, # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 - -($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); +{ +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31)); $code.=<<___; .global bn_GF2m_mul_2x2 @@ -173,44 +156,58 @@ bn_GF2m_mul_2x2: tst r12,#1 beq .Lialu - veor $A1,$A1 - vmov.32 $B1,r3,r3 @ two copies of b1 - vmov.32 ${A1}[0],r1 @ a1 - - veor $A0,$A0 - vld1.32 ${B0}[],[sp,:32] @ two copies of b0 - vmov.32 ${A0}[0],r2 @ a0 - mov r12,lr - - vmov d16,$A1 - vmov d17,$B1 - bl mul_1x1_neon @ a1·b1 - vmov $A1B1,d0 - - vmov d16,$A0 - vmov d17,$B0 - bl mul_1x1_neon @ a0·b0 - vmov $A0B0,d0 - - veor d16,$A0,$A1 - veor d17,$B0,$B1 - veor $A0,$A0B0,$A1B1 - bl mul_1x1_neon @ (a0+a1)·(b0+b1) - - veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 - vshl.u64 d1,d0,#32 - vshr.u64 d0,d0,#32 - veor $A0B0,d1 - veor $A1B1,d0 - vst1.32 {${A0B0}[0]},[r0,:32]! - vst1.32 {${A0B0}[1]},[r0,:32]! - vst1.32 {${A1B1}[0]},[r0,:32]! 
- vst1.32 {${A1B1}[1]},[r0,:32] - bx r12 + ldr r12, [sp] @ 5th argument + vmov.32 $a, r2, r1 + vmov.32 $b, r12, r3 + vmov.i64 $k48, #0x0000ffffffffffff + vmov.i64 $k32, #0x00000000ffffffff + vmov.i64 $k16, #0x000000000000ffff + + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 + + vst1.32 {$r}, [r0] + ret @ bx lr .align 4 .Lialu: #endif ___ +} $ret="r10"; # reassigned 1st argument $code.=<<___; stmdb sp!,{r4-r10,lr} @@ -272,7 +269,13 @@ $code.=<<___; .comm OPENSSL_armcap_P,4,4 ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + 
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} close STDOUT; # enforce flush diff --git a/app/openssl/crypto/bn/asm/armv4-mont.S b/app/openssl/crypto/bn/asm/armv4-mont.S index eb5cd951..fecae15e 120000..100644 --- a/app/openssl/crypto/bn/asm/armv4-mont.S +++ b/app/openssl/crypto/bn/asm/armv4-mont.S @@ -1 +1,579 @@ -armv4-mont.s
\ No newline at end of file +#include "arm_arch.h" + +.text +.code 32 + +#if __ARM_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-bn_mul_mont +#endif + +.global bn_mul_mont +.type bn_mul_mont,%function + +.align 5 +bn_mul_mont: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block +#if __ARM_ARCH__>=7 + tst ip,#7 + bne .Lialu + adr r0,bn_mul_mont + ldr r2,.LOPENSSL_armcap + ldr r0,[r0,r2] + tst r0,#1 @ NEON available? + ldmia sp, {r0,r2} + beq .Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +.Lialu: +#endif + cmp ip,#2 + mov r0,ip @ load num + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,sp @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! 
@ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,sp @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + + and r1,r4,r14 + bic r3,r2,r14 + orr r1,r1,r3 @ ap=borrow?tp:rp + +.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh + str sp,[r4],#4 @ zap tp + str r7,[r2],#4 + cmp r4,r0 + bne .Lcopy + + add sp,r0,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont,.-bn_mul_mont +#if __ARM_ARCH__>=7 +.fpu neon + +.type 
bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + + sub r7,sp,#16 + vld1.32 {d28[0]}, [r2,:32]! + sub r7,r7,r5,lsl#4 + vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + veor d8,d8,d8 + subs r8,r5,#8 + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d10,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d10,d10,d12 + veor d8,d8,d8 + vmul.u32 d29,d10,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4-d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + bne .LNEON_1st + + @ special case for num=8, everything is in register bank... + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! 
+ veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d10,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d10,d10,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d10,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#16 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail2 + +.align 4 +.LNEON_1st: + vmlal.u32 q6,d29,d4[0] + vld1.32 {d0-d3}, [r1]! + vmlal.u32 q7,d29,d4[1] + subs r8,r8,#8 + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vld1.32 {d4-d5}, [r3]! + vmlal.u32 q11,d29,d6[1] + vst1.64 {q6-q7}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q8-q9}, [r7,:256]! + + vmull.u32 q6,d28,d0[0] + vld1.32 {d6-d7}, [r3]! + vmull.u32 q7,d28,d0[1] + vst1.64 {q10-q11}, [r7,:256]! + vmull.u32 q8,d28,d1[0] + vmull.u32 q9,d28,d1[1] + vst1.64 {q12-q13}, [r7,:256]! + + vmull.u32 q10,d28,d2[0] + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vmull.u32 q13,d28,d3[1] + + bne .LNEON_1st + + vmlal.u32 q6,d29,d4[0] + add r6,sp,#16 + vmlal.u32 q7,d29,d4[1] + sub r1,r1,r5,lsl#2 @ rewind r1 + vmlal.u32 q8,d29,d5[0] + vld1.64 {q5}, [sp,:128] + vmlal.u32 q9,d29,d5[1] + sub r9,r5,#1 + + vmlal.u32 q10,d29,d6[0] + vst1.64 {q6-q7}, [r7,:256]! + vmlal.u32 q11,d29,d6[1] + vshr.u64 d10,d10,#16 + vld1.64 {q6}, [r6, :128]! 
+ vmlal.u32 q12,d29,d7[0] + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q13,d29,d7[1] + + vst1.64 {q10-q11}, [r7,:256]! + vadd.u64 d10,d10,d11 + veor q4,q4,q4 + vst1.64 {q12-q13}, [r7,:256]! + vld1.64 {q7-q8}, [r6, :256]! + vst1.64 {q4}, [r7,:128] + vshr.u64 d10,d10,#16 + + b .LNEON_outer + +.align 4 +.LNEON_outer: + vld1.32 {d28[0]}, [r2,:32]! + sub r3,r3,r5,lsl#2 @ rewind r3 + vld1.32 {d0-d3}, [r1]! + veor d8,d8,d8 + mov r7,sp + vzip.16 d28,d8 + sub r8,r5,#8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vld1.64 {q9-q10},[r6,:256]! + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vld1.64 {q11-q12},[r6,:256]! + vmlal.u32 q9,d28,d1[1] + + vshl.i64 d10,d13,#16 + veor d8,d8,d8 + vadd.u64 d10,d10,d12 + vld1.64 {q13},[r6,:128]! + vmul.u32 d29,d10,d30 + + vmlal.u32 q10,d28,d2[0] + vld1.32 {d4-d7}, [r3]! + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + +.LNEON_inner: + vmlal.u32 q6,d29,d4[0] + vld1.32 {d0-d3}, [r1]! + vmlal.u32 q7,d29,d4[1] + subs r8,r8,#8 + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vst1.64 {q6-q7}, [r7,:256]! + + vmlal.u32 q10,d29,d6[0] + vld1.64 {q6}, [r6, :128]! + vmlal.u32 q11,d29,d6[1] + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vld1.64 {q7-q8}, [r6, :256]! + vmlal.u32 q13,d29,d7[1] + vst1.64 {q10-q11}, [r7,:256]! + + vmlal.u32 q6,d28,d0[0] + vld1.64 {q9-q10}, [r6, :256]! + vmlal.u32 q7,d28,d0[1] + vst1.64 {q12-q13}, [r7,:256]! + vmlal.u32 q8,d28,d1[0] + vld1.64 {q11-q12}, [r6, :256]! + vmlal.u32 q9,d28,d1[1] + vld1.32 {d4-d7}, [r3]! + + vmlal.u32 q10,d28,d2[0] + vld1.64 {q13}, [r6, :128]! + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + + bne .LNEON_inner + + vmlal.u32 q6,d29,d4[0] + add r6,sp,#16 + vmlal.u32 q7,d29,d4[1] + sub r1,r1,r5,lsl#2 @ rewind r1 + vmlal.u32 q8,d29,d5[0] + vld1.64 {q5}, [sp,:128] + vmlal.u32 q9,d29,d5[1] + subs r9,r9,#1 + + vmlal.u32 q10,d29,d6[0] + vst1.64 {q6-q7}, [r7,:256]! 
+ vmlal.u32 q11,d29,d6[1] + vld1.64 {q6}, [r6, :128]! + vshr.u64 d10,d10,#16 + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vld1.64 {q7-q8}, [r6, :256]! + vmlal.u32 q13,d29,d7[1] + + vst1.64 {q10-q11}, [r7,:256]! + vadd.u64 d10,d10,d11 + vst1.64 {q12-q13}, [r7,:256]! + vshr.u64 d10,d10,#16 + + bne .LNEON_outer + + mov r7,sp + mov r8,r5 + +.LNEON_tail: + vadd.u64 d12,d12,d10 + vld1.64 {q9-q10}, [r6, :256]! + vshr.u64 d10,d12,#16 + vadd.u64 d13,d13,d10 + vld1.64 {q11-q12}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q13}, [r6, :128]! + vzip.16 d12,d13 + +.LNEON_tail2: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vld1.64 {q6}, [r6, :128]! + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vld1.64 {q7-q8}, [r6, :256]! + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! 
+ + bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4-r7} + ldmia r3!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8-r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + veor q0,q0,q0 + sub r11,r2,sp @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r11,r7 + ldmia r1, {r4-r7} + stmia r0!, {r8-r11} + sub r1,r1,#16 + ldmia r0, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [r1,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8-r11} + bne .LNEON_copy_n_zap + + sub sp,ip,#96 + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + bx lr @ .word 0xe12fff1e +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#if __ARM_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/app/openssl/crypto/bn/asm/armv4-mont.pl b/app/openssl/crypto/bn/asm/armv4-mont.pl index f78a8b5f..72bad8e3 100644 --- a/app/openssl/crypto/bn/asm/armv4-mont.pl +++ b/app/openssl/crypto/bn/asm/armv4-mont.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. 
For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -23,6 +23,21 @@ # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. +# November 2013 +# +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA +# performance improvement on Cortex-A8 is ~45-100% depending on key +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. +# On Snapdragon S4 improvement was measured to vary from ~70% to +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is +# rather because original integer-only code seems to perform +# suboptimally on S4. Situation on Cortex-A9 is unfortunately +# different. It's being looked into, but the trouble is that +# performance for vectors longer than 256 bits is actually couple +# of percent worse than for integer-only code. The code is chosen +# for execution on all NEON-capable processors, because gain on +# others outweighs the marginal loss on Cortex-A9. + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -52,16 +67,40 @@ $_n0="$num,#14*4"; $_num="$num,#15*4"; $_bpend=$_num; $code=<<___; +#include "arm_arch.h" + .text +.code 32 + +#if __ARM_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-bn_mul_mont +#endif .global bn_mul_mont .type bn_mul_mont,%function -.align 2 +.align 5 bn_mul_mont: + ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block - ldr $num,[sp,#3*4] @ load num - cmp $num,#2 +#if __ARM_ARCH__>=7 + tst ip,#7 + bne .Lialu + adr r0,bn_mul_mont + ldr r2,.LOPENSSL_armcap + ldr r0,[r0,r2] + tst r0,#1 @ NEON available? 
+ ldmia sp, {r0,r2} + beq .Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +.Lialu: +#endif + cmp ip,#2 + mov $num,ip @ load num movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt @@ -191,14 +230,446 @@ bn_mul_mont: ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size bn_mul_mont,.-bn_mul_mont -.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ +{ +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); +my ($Z,$Temp)=("q4","q5"); +my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); +my $zero=&Dlo($Z); +my $temp=&Dlo($Temp); + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + + sub $toutptr,sp,#16 + vld1.32 {${Bi}[0]}, [$bptr,:32]! + sub $toutptr,$toutptr,$num,lsl#4 + vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( + and $toutptr,$toutptr,#-64 + vld1.32 {${M0}[0]}, [$n0,:32] + mov sp,$toutptr @ alloca + veor $zero,$zero,$zero + subs $inner,$num,#8 + vzip.16 $Bi,$zero + + vmull.u32 $A0xB,$Bi,${A0}[0] + vmull.u32 $A1xB,$Bi,${A0}[1] + vmull.u32 $A2xB,$Bi,${A1}[0] + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + vmull.u32 $A3xB,$Bi,${A1}[1] + + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + veor $zero,$zero,$zero + vmul.u32 $Ni,$temp,$M0 + + vmull.u32 $A4xB,$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! 
+ vmull.u32 $A5xB,$Bi,${A2}[1] + vmull.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmull.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_1st + + @ special case for num=8, everything is in register bank... + + vmlal.u32 $A0xB,$Ni,${N0}[0] + sub $outer,$num,#1 + vmlal.u32 $A1xB,$Ni,${N0}[1] + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vmov $Temp,$A0xB + vmlal.u32 $A5xB,$Ni,${N2}[1] + vmov $A0xB,$A1xB + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmov $A1xB,$A2xB + vmlal.u32 $A7xB,$Ni,${N3}[1] + vmov $A2xB,$A3xB + vmov $A3xB,$A4xB + vshr.u64 $temp,$temp,#16 + vmov $A4xB,$A5xB + vmov $A5xB,$A6xB + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vmov $A6xB,$A7xB + veor $A7xB,$A7xB + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vmlal.u32 $A1xB,$Bi,${A0}[1] + vmlal.u32 $A2xB,$Bi,${A1}[0] + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + vmlal.u32 $A3xB,$Bi,${A1}[1] + + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + veor $zero,$zero,$zero + subs $outer,$outer,#1 + vmul.u32 $Ni,$temp,$M0 + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 $A7xB,$Bi,${A3}[1] + + vmlal.u32 $A0xB,$Ni,${N0}[0] + vmlal.u32 $A1xB,$Ni,${N0}[1] + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vmov $Temp,$A0xB + vmlal.u32 $A5xB,$Ni,${N2}[1] + vmov $A0xB,$A1xB + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmov $A1xB,$A2xB + vmlal.u32 $A7xB,$Ni,${N3}[1] + vmov $A2xB,$A3xB + vmov $A3xB,$A4xB + vshr.u64 $temp,$temp,#16 + vmov $A4xB,$A5xB + vmov $A5xB,$A6xB + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vmov $A6xB,$A7xB + veor $A7xB,$A7xB + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer8 + + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + mov $toutptr,sp + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + mov $inner,$num + 
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp + add $tinptr,sp,#16 + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + + b .LNEON_tail2 + +.align 4 +.LNEON_1st: + vmlal.u32 $A0xB,$Ni,${N0}[0] + vld1.32 {$A0-$A3}, [$aptr]! + vmlal.u32 $A1xB,$Ni,${N0}[1] + subs $inner,$inner,#8 + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vld1.32 {$N0-$N1}, [$nptr]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmlal.u32 $A7xB,$Ni,${N3}[1] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + + vmull.u32 $A0xB,$Bi,${A0}[0] + vld1.32 {$N2-$N3}, [$nptr]! + vmull.u32 $A1xB,$Bi,${A0}[1] + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vmull.u32 $A2xB,$Bi,${A1}[0] + vmull.u32 $A3xB,$Bi,${A1}[1] + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + + vmull.u32 $A4xB,$Bi,${A2}[0] + vmull.u32 $A5xB,$Bi,${A2}[1] + vmull.u32 $A6xB,$Bi,${A3}[0] + vmull.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_1st + + vmlal.u32 $A0xB,$Ni,${N0}[0] + add $tinptr,sp,#16 + vmlal.u32 $A1xB,$Ni,${N0}[1] + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr + vmlal.u32 $A2xB,$Ni,${N1}[0] + vld1.64 {$Temp}, [sp,:128] + vmlal.u32 $A3xB,$Ni,${N1}[1] + sub $outer,$num,#1 + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vshr.u64 $temp,$temp,#16 + vld1.64 {$A0xB}, [$tinptr, :128]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + veor $Z,$Z,$Z + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vst1.64 {$Z}, [$toutptr,:128] + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer + +.align 4 +.LNEON_outer: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + vld1.32 {$A0-$A3}, [$aptr]! 
+ veor $zero,$zero,$zero + mov $toutptr,sp + vzip.16 $Bi,$zero + sub $inner,$num,#8 + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! + vmlal.u32 $A1xB,$Bi,${A0}[1] + vmlal.u32 $A2xB,$Bi,${A1}[0] + vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! + vmlal.u32 $A3xB,$Bi,${A1}[1] + + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + veor $zero,$zero,$zero + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vld1.64 {$A7xB},[$tinptr,:128]! + vmul.u32 $Ni,$temp,$M0 + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 $A7xB,$Bi,${A3}[1] + +.LNEON_inner: + vmlal.u32 $A0xB,$Ni,${N0}[0] + vld1.32 {$A0-$A3}, [$aptr]! + vmlal.u32 $A1xB,$Ni,${N0}[1] + subs $inner,$inner,#8 + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vld1.64 {$A0xB}, [$tinptr, :128]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! + vmlal.u32 $A1xB,$Bi,${A0}[1] + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vmlal.u32 $A2xB,$Bi,${A1}[0] + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! + vmlal.u32 $A3xB,$Bi,${A1}[1] + vld1.32 {$N0-$N3}, [$nptr]! + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vld1.64 {$A7xB}, [$tinptr, :128]! + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vmlal.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_inner + + vmlal.u32 $A0xB,$Ni,${N0}[0] + add $tinptr,sp,#16 + vmlal.u32 $A1xB,$Ni,${N0}[1] + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr + vmlal.u32 $A2xB,$Ni,${N1}[0] + vld1.64 {$Temp}, [sp,:128] + vmlal.u32 $A3xB,$Ni,${N1}[1] + subs $outer,$outer,#1 + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 
+ vmlal.u32 $A5xB,$Ni,${N2}[1] + vld1.64 {$A0xB}, [$tinptr, :128]! + vshr.u64 $temp,$temp,#16 + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer + + mov $toutptr,sp + mov $inner,$num + +.LNEON_tail: + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 + vld1.64 {$A7xB}, [$tinptr, :128]! + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + +.LNEON_tail2: + vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp + vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A1xB")`,#16 + vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp + vshr.u64 $temp,`&Dhi("$A1xB")`,#16 + vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` + + vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp + vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A2xB")`,#16 + vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp + vshr.u64 $temp,`&Dhi("$A2xB")`,#16 + vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` + + vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp + vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A3xB")`,#16 + vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp + vshr.u64 $temp,`&Dhi("$A3xB")`,#16 + vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` + + vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp + vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A4xB")`,#16 + vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp + vshr.u64 $temp,`&Dhi("$A4xB")`,#16 + vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` + + vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp + vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! 
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16 + vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp + vshr.u64 $temp,`&Dhi("$A5xB")`,#16 + vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` + + vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp + vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A6xB")`,#16 + vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp + vld1.64 {$A0xB}, [$tinptr, :128]! + vshr.u64 $temp,`&Dhi("$A6xB")`,#16 + vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` + + vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp + vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A7xB")`,#16 + vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dhi("$A7xB")`,#16 + vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` + subs $inner,$inner,#8 + vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! + + bne .LNEON_tail + + vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + subs $aptr,sp,#0 @ clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldmia $aptr!, {r4-r7} + ldmia $nptr!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_sub + + ldr r10, [$aptr] @ load top-most bit + veor q0,q0,q0 + sub r11,$bptr,sp @ this is num*4 + veor q1,q1,q1 + mov $aptr,sp + sub $rptr,$rptr,r11 @ rewind $rptr + mov $nptr,$bptr @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia $aptr!, {r4-r7} + ldmia $rptr, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + movcc r11,r7 + ldmia $aptr, {r4-r7} + stmia $rptr!, {r8-r11} + sub $aptr,$aptr,#16 + ldmia $rptr, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe + movcc r11,r7 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_copy_n_zap + + sub sp,ip,#96 + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + ret @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +___ +} +$code.=<<___; +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +#if __ARM_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif ___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; print $code; close STDOUT; diff --git a/app/openssl/crypto/bn/asm/armv4-mont.s b/app/openssl/crypto/bn/asm/armv4-mont.s deleted file mode 100644 index 64c220b5..00000000 --- a/app/openssl/crypto/bn/asm/armv4-mont.s +++ /dev/null @@ -1,145 +0,0 @@ -.text - -.global bn_mul_mont -.type bn_mul_mont,%function - -.align 2 -bn_mul_mont: - stmdb sp!,{r0,r2} @ sp points at argument block - ldr r0,[sp,#3*4] @ load num - cmp r0,#2 - movlt r0,#0 - addlt sp,sp,#2*4 - blt .Labrt - - stmdb sp!,{r4-r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -.L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ 
restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - -.Louter: - sub r7,r0,sp @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! @ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -.Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 - bne .Louter - - ldr r2,[r0,#12*4] @ pull rp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,sp @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -.Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne .Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - - and r1,r4,r14 - bic r3,r2,r14 - orr r1,r1,r3 @ ap=borrow?tp:rp - -.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh - str sp,[r4],#4 @ zap tp - str r7,[r2],#4 - cmp r4,r0 - bne .Lcopy - - add sp,r0,#4 @ skip over tp[num+1] - ldmia sp!,{r4-r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labrt: tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ 
interoperable with Thumb ISA:-) -.size bn_mul_mont,.-bn_mul_mont -.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align 2 diff --git a/app/openssl/crypto/bn/asm/mips3.s b/app/openssl/crypto/bn/asm/mips3.S index dca4105c..dca4105c 100644 --- a/app/openssl/crypto/bn/asm/mips3.s +++ b/app/openssl/crypto/bn/asm/mips3.S diff --git a/app/openssl/crypto/bn/asm/pa-risc2.s b/app/openssl/crypto/bn/asm/pa-risc2.S index f3b16290..f3b16290 100644 --- a/app/openssl/crypto/bn/asm/pa-risc2.s +++ b/app/openssl/crypto/bn/asm/pa-risc2.S diff --git a/app/openssl/crypto/bn/asm/pa-risc2W.s b/app/openssl/crypto/bn/asm/pa-risc2W.S index a9954575..a9954575 100644 --- a/app/openssl/crypto/bn/asm/pa-risc2W.s +++ b/app/openssl/crypto/bn/asm/pa-risc2W.S diff --git a/app/openssl/crypto/bn/bn_mont.c b/app/openssl/crypto/bn/bn_mont.c index 427b5cf4..ee8532c7 100644 --- a/app/openssl/crypto/bn/bn_mont.c +++ b/app/openssl/crypto/bn/bn_mont.c @@ -478,32 +478,38 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from) BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock, const BIGNUM *mod, BN_CTX *ctx) { - int got_write_lock = 0; BN_MONT_CTX *ret; CRYPTO_r_lock(lock); - if (!*pmont) + ret = *pmont; + CRYPTO_r_unlock(lock); + if (ret) + return ret; + + /* We don't want to serialise globally while doing our lazy-init math in + * BN_MONT_CTX_set. That punishes threads that are doing independent + * things. Instead, punish the case where more than one thread tries to + * lazy-init the same 'pmont', by having each do the lazy-init math work + * independently and only use the one from the thread that wins the race + * (the losers throw away the work they've done). 
*/ + ret = BN_MONT_CTX_new(); + if (!ret) + return NULL; + if (!BN_MONT_CTX_set(ret, mod, ctx)) { - CRYPTO_r_unlock(lock); - CRYPTO_w_lock(lock); - got_write_lock = 1; + BN_MONT_CTX_free(ret); + return NULL; + } - if (!*pmont) - { - ret = BN_MONT_CTX_new(); - if (ret && !BN_MONT_CTX_set(ret, mod, ctx)) - BN_MONT_CTX_free(ret); - else - *pmont = ret; - } + /* The locked compare-and-set, after the local work is done. */ + CRYPTO_w_lock(lock); + if (*pmont) + { + BN_MONT_CTX_free(ret); + ret = *pmont; } - - ret = *pmont; - - if (got_write_lock) - CRYPTO_w_unlock(lock); else - CRYPTO_r_unlock(lock); - + *pmont = ret; + CRYPTO_w_unlock(lock); return ret; } diff --git a/app/openssl/crypto/cms/cms_env.c b/app/openssl/crypto/cms/cms_env.c index be20b1c0..add00bf9 100644 --- a/app/openssl/crypto/cms/cms_env.c +++ b/app/openssl/crypto/cms/cms_env.c @@ -185,6 +185,8 @@ CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms, if (flags & CMS_USE_KEYID) { ktri->version = 2; + if (env->version < 2) + env->version = 2; type = CMS_RECIPINFO_KEYIDENTIFIER; } else diff --git a/app/openssl/crypto/cms/cms_sd.c b/app/openssl/crypto/cms/cms_sd.c index 77fbd135..51dd33a1 100644 --- a/app/openssl/crypto/cms/cms_sd.c +++ b/app/openssl/crypto/cms/cms_sd.c @@ -158,8 +158,8 @@ static void cms_sd_set_version(CMS_SignedData *sd) if (sd->version < 3) sd->version = 3; } - else - sd->version = 1; + else if (si->version < 1) + si->version = 1; } if (sd->version < 1) diff --git a/app/openssl/crypto/cms/cms_smime.c b/app/openssl/crypto/cms/cms_smime.c index 8c56e3a8..1af9f3a6 100644 --- a/app/openssl/crypto/cms/cms_smime.c +++ b/app/openssl/crypto/cms/cms_smime.c @@ -611,7 +611,7 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert) STACK_OF(CMS_RecipientInfo) *ris; CMS_RecipientInfo *ri; int i, r; - int debug = 0; + int debug = 0, ri_match = 0; ris = CMS_get0_RecipientInfos(cms); if (ris) debug = cms->d.envelopedData->encryptedContentInfo->debug; @@ -620,6 +620,7 @@ 
int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert) ri = sk_CMS_RecipientInfo_value(ris, i); if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_TRANS) continue; + ri_match = 1; /* If we have a cert try matching RecipientInfo * otherwise try them all. */ @@ -655,7 +656,7 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert) } } /* If no cert and not debugging always return success */ - if (!cert && !debug) + if (ri_match && !cert && !debug) { ERR_clear_error(); return 1; diff --git a/app/openssl/crypto/dso/dso_dlfcn.c b/app/openssl/crypto/dso/dso_dlfcn.c index 5f225480..4a56aace 100644 --- a/app/openssl/crypto/dso/dso_dlfcn.c +++ b/app/openssl/crypto/dso/dso_dlfcn.c @@ -464,7 +464,7 @@ static int dlfcn_pathbyaddr(void *addr,char *path,int sz) return len; } - ERR_add_error_data(4, "dlfcn_pathbyaddr(): ", dlerror()); + ERR_add_error_data(2, "dlfcn_pathbyaddr(): ", dlerror()); #endif return -1; } diff --git a/app/openssl/crypto/ec/ec_ameth.c b/app/openssl/crypto/ec/ec_ameth.c index 0ce45240..f715a238 100644 --- a/app/openssl/crypto/ec/ec_ameth.c +++ b/app/openssl/crypto/ec/ec_ameth.c @@ -352,6 +352,7 @@ static int eckey_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey) EC_KEY_set_enc_flags(ec_key, old_flags); OPENSSL_free(ep); ECerr(EC_F_ECKEY_PRIV_ENCODE, ERR_R_EC_LIB); + return 0; } /* restore old encoding flags */ EC_KEY_set_enc_flags(ec_key, old_flags); diff --git a/app/openssl/crypto/ec/ec_asn1.c b/app/openssl/crypto/ec/ec_asn1.c index 145807b6..e94f34e1 100644 --- a/app/openssl/crypto/ec/ec_asn1.c +++ b/app/openssl/crypto/ec/ec_asn1.c @@ -1435,8 +1435,11 @@ int i2o_ECPublicKey(EC_KEY *a, unsigned char **out) *out, buf_len, NULL)) { ECerr(EC_F_I2O_ECPUBLICKEY, ERR_R_EC_LIB); - OPENSSL_free(*out); - *out = NULL; + if (new_buffer) + { + OPENSSL_free(*out); + *out = NULL; + } return 0; } if (!new_buffer) diff --git a/app/openssl/crypto/ec/ec_lcl.h b/app/openssl/crypto/ec/ec_lcl.h index 6f714c75..dae91483 100644 --- 
a/app/openssl/crypto/ec/ec_lcl.h +++ b/app/openssl/crypto/ec/ec_lcl.h @@ -405,7 +405,7 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx); int ec_GF2m_have_precompute_mult(const EC_GROUP *group); -#ifndef OPENSSL_EC_NISTP_64_GCC_128 +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 /* method functions in ecp_nistp224.c */ int ec_GFp_nistp224_group_init(EC_GROUP *group); int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *); diff --git a/app/openssl/crypto/evp/bio_b64.c b/app/openssl/crypto/evp/bio_b64.c index ac6d441a..16863fe2 100644 --- a/app/openssl/crypto/evp/bio_b64.c +++ b/app/openssl/crypto/evp/bio_b64.c @@ -226,6 +226,7 @@ static int b64_read(BIO *b, char *out, int outl) else if (ctx->start) { q=p=(unsigned char *)ctx->tmp; + num = 0; for (j=0; j<i; j++) { if (*(q++) != '\n') continue; diff --git a/app/openssl/crypto/evp/e_aes.c b/app/openssl/crypto/evp/e_aes.c index 41cee42d..ad0f7a4a 100644 --- a/app/openssl/crypto/evp/e_aes.c +++ b/app/openssl/crypto/evp/e_aes.c @@ -62,7 +62,7 @@ typedef struct { - AES_KEY ks; + union { double align; AES_KEY ks; } ks; block128_f block; union { cbc128_f cbc; @@ -72,7 +72,7 @@ typedef struct typedef struct { - AES_KEY ks; /* AES key schedule to use */ + union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ int key_set; /* Set if key initialised */ int iv_set; /* Set if an iv is set */ GCM128_CONTEXT gcm; @@ -86,7 +86,7 @@ typedef struct typedef struct { - AES_KEY ks1, ks2; /* AES key schedules to use */ + union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */ XTS128_CONTEXT xts; void (*stream)(const unsigned char *in, unsigned char *out, size_t length, @@ -96,7 +96,7 @@ typedef struct typedef struct { - AES_KEY ks; /* AES key schedule to use */ + union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ int key_set; /* Set if key 
initialised */ int iv_set; /* Set if an iv is set */ int tag_set; /* Set if tag is valid */ @@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len, defined(_M_AMD64) || defined(_M_X64) || \ defined(__INTEL__) ) -extern unsigned int OPENSSL_ia32cap_P[2]; +extern unsigned int OPENSSL_ia32cap_P[]; #ifdef VPAES_ASM #define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32))) @@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) { - aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); + aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)aesni_encrypt); gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; @@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, /* key_len is two AES keys */ if (enc) { - aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)aesni_encrypt; xctx->stream = aesni_xts_encrypt; } else { - aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)aesni_decrypt; xctx->stream = aesni_xts_decrypt; } aesni_set_encrypt_key(key + ctx->key_len/2, - ctx->key_len * 4, &xctx->ks2); + ctx->key_len * 4, &xctx->ks2.ks); xctx->xts.block2 = (block128_f)aesni_encrypt; xctx->xts.key1 = &xctx->ks1; @@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) { - aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); + aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, &cctx->ks, (block128_f)aesni_encrypt); cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks : @@ -482,14 +482,38 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \ NULL,NULL,aes_##mode##_ctrl,NULL }; \ const 
EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ { return &aes_##keylen##_##mode; } - #endif -#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm)) +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) #include "arm_arch.h" #if __ARM_ARCH__>=7 -#define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) +# if defined(BSAES_ASM) +# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) +# endif +# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# define HWAES_set_encrypt_key aes_v8_set_encrypt_key +# define HWAES_set_decrypt_key aes_v8_set_decrypt_key +# define HWAES_encrypt aes_v8_encrypt +# define HWAES_decrypt aes_v8_decrypt +# define HWAES_cbc_encrypt aes_v8_cbc_encrypt +# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks +#endif #endif + +#if defined(HWAES_CAPABLE) +int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +void HWAES_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void HWAES_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, + unsigned char *ivec, const int enc); +void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *key, const unsigned char ivec[16]); #endif #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \ @@ -510,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, mode = ctx->cipher->flags & EVP_CIPH_MODE; if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)HWAES_decrypt; + dat->stream.cbc = NULL; +#ifdef HWAES_cbc_encrypt + if 
(mode==EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; +#endif + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE) { - ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)AES_decrypt; dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt; } @@ -522,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { - ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)vpaes_decrypt; dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? (cbc128_f)vpaes_cbc_encrypt : @@ -531,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, else #endif { - ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)AES_decrypt; dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? 
(cbc128_f)AES_cbc_encrypt : NULL; } else +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)HWAES_encrypt; + dat->stream.cbc = NULL; +#ifdef HWAES_cbc_encrypt + if (mode==EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; + else +#endif +#ifdef HWAES_ctr32_encrypt_blocks + if (mode==EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; + else +#endif + (void)0; /* terminate potentially open 'else' */ + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE) { - ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)AES_encrypt; dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; } @@ -550,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { - ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)vpaes_encrypt; dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? (cbc128_f)vpaes_cbc_encrypt : @@ -559,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, else #endif { - ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); dat->block = (block128_f)AES_encrypt; dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? 
(cbc128_f)AES_cbc_encrypt : @@ -830,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) { do { +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)HWAES_encrypt); +#ifdef HWAES_ctr32_encrypt_blocks + gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; +#else + gctx->ctr = NULL; +#endif + break; + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE) { - AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); + AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, (block128_f)AES_encrypt); gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; @@ -844,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { - vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); + vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, (block128_f)vpaes_encrypt); gctx->ctr = NULL; @@ -854,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, #endif (void)0; /* terminate potentially open 'else' */ - AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); + AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt); #ifdef AES_CTR_ASM gctx->ctr = (ctr128_f)AES_ctr32_encrypt; @@ -1075,29 +1147,50 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, xctx->stream = NULL; #endif /* key_len is two AES keys */ -#if !(defined(__arm__) || defined(__arm)) /* not yet? 
*/ +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + if (enc) + { + HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)HWAES_encrypt; + } + else + { + HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)HWAES_decrypt; + } + + HWAES_set_encrypt_key(key + ctx->key_len/2, + ctx->key_len * 4, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f)HWAES_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE) xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt; else #endif -#endif #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { if (enc) { - vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)vpaes_encrypt; } else { - vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)vpaes_decrypt; } vpaes_set_encrypt_key(key + ctx->key_len/2, - ctx->key_len * 4, &xctx->ks2); + ctx->key_len * 4, &xctx->ks2.ks); xctx->xts.block2 = (block128_f)vpaes_encrypt; xctx->xts.key1 = &xctx->ks1; @@ -1109,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, if (enc) { - AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)AES_encrypt; } else { - AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); xctx->xts.block1 = (block128_f)AES_decrypt; } AES_set_encrypt_key(key + ctx->key_len/2, - ctx->key_len * 4, &xctx->ks2); + ctx->key_len * 4, &xctx->ks2.ks); xctx->xts.block2 = (block128_f)AES_encrypt; xctx->xts.key1 = &xctx->ks1; @@ -1227,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) do { +#ifdef HWAES_CAPABLE + if 
(HWAES_CAPABLE) + { + HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks); + + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)HWAES_encrypt); + cctx->str = NULL; + cctx->key_set = 1; + break; + } + else +#endif #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { - vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); + vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks); CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, &cctx->ks, (block128_f)vpaes_encrypt); cctx->str = NULL; @@ -1238,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, break; } #endif - AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); + AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, &cctx->ks, (block128_f)AES_encrypt); cctx->str = NULL; diff --git a/app/openssl/crypto/evp/encode.c b/app/openssl/crypto/evp/encode.c index 28546a84..4654bdc6 100644 --- a/app/openssl/crypto/evp/encode.c +++ b/app/openssl/crypto/evp/encode.c @@ -324,6 +324,7 @@ int EVP_DecodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl, v=EVP_DecodeBlock(out,d,n); n=0; if (v < 0) { rv=0; goto end; } + if (eof > v) { rv=-1; goto end; } ret+=(v-eof); } else diff --git a/app/openssl/crypto/evp/p_lib.c b/app/openssl/crypto/evp/p_lib.c index bd1977d7..8ee53c1d 100644 --- a/app/openssl/crypto/evp/p_lib.c +++ b/app/openssl/crypto/evp/p_lib.c @@ -202,7 +202,7 @@ EVP_PKEY *EVP_PKEY_new(void) EVP_PKEY *EVP_PKEY_dup(EVP_PKEY *pkey) { - CRYPTO_add(&pkey->references, 1, CRYPTO_LOCK_EVP_PKEY); + CRYPTO_add(&pkey->references,1,CRYPTO_LOCK_EVP_PKEY); return pkey; } diff --git a/app/openssl/crypto/modes/asm/ghash-armv4.S b/app/openssl/crypto/modes/asm/ghash-armv4.S index d66c4cbf..6c453774 100644 --- a/app/openssl/crypto/modes/asm/ghash-armv4.S +++ b/app/openssl/crypto/modes/asm/ghash-armv4.S @@ -309,99 +309,213 @@ gcm_gmult_4bit: #if __ARM_ARCH__>=7 .fpu neon +.global gcm_init_neon +.type gcm_init_neon,%function 
+.align 4 +gcm_init_neon: + vld1.64 d7,[r1,:64]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1,:64] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr +.size gcm_init_neon,.-gcm_init_neon + .global gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: - sub r1,#16 @ point at H in GCM128_CTX - vld1.64 d29,[r0,:64]!@ load Xi - vmov.i32 d5,#0xe1 @ our irreducible polynomial - vld1.64 d28,[r0,:64]! - vshr.u64 d5,#32 - vldmia r1,{d0-d1} @ load H - veor q12,q12 + vld1.64 d7,[r0,:64]! @ load Xi + vld1.64 d6,[r0,:64]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26-d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff #ifdef __ARMEL__ - vrev64.8 q14,q14 + vrev64.8 q3,q3 #endif - veor q13,q13 - veor q11,q11 - mov r1,#16 - veor q10,q10 + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing mov r3,#16 - veor d2,d2 - vdup.8 d4,d28[0] @ broadcast lowest byte - b .Linner_neon + b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .global gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: - vld1.64 d21,[r0,:64]! @ load Xi - vmov.i32 d5,#0xe1 @ our irreducible polynomial - vld1.64 d20,[r0,:64]! - vshr.u64 d5,#32 - vldmia r0,{d0-d1} @ load H - veor q12,q12 - nop + vld1.64 d1,[r0,:64]! @ load Xi + vld1.64 d0,[r0,:64]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26-d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff #ifdef __ARMEL__ - vrev64.8 q10,q10 + vrev64.8 q0,q0 #endif -.Louter_neon: - vld1.64 d29,[r2]! @ load inp - veor q13,q13 - vld1.64 d28,[r2]! - veor q11,q11 - mov r1,#16 + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! 
#ifdef __ARMEL__ - vrev64.8 q14,q14 + vrev64.8 q3,q3 #endif - veor d2,d2 - veor q14,q10 @ inp^=Xi - veor q10,q10 - vdup.8 d4,d28[0] @ broadcast lowest byte -.Linner_neon: - subs r1,r1,#1 - vmull.p8 q9,d1,d4 @ H.lo·Xi[i] - vmull.p8 q8,d0,d4 @ H.hi·Xi[i] - vext.8 q14,q12,#1 @ IN>>=8 - - veor q10,q13 @ modulo-scheduled part - vshl.i64 d22,#48 - vdup.8 d4,d28[0] @ broadcast lowest byte - veor d3,d18,d20 - - veor d21,d22 - vuzp.8 q9,q8 - vsli.8 d2,d3,#1 @ compose the "carry" byte - vext.8 q10,q12,#1 @ Z>>=8 + veor q3,q0 @ inp^=Xi +.Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, 
q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 
256-bit result - vmull.p8 q11,d2,d5 @ "carry"·0xe1 - vshr.u8 d2,d3,#7 @ save Z's bottom bit - vext.8 q13,q9,q12,#1 @ Qlo>>=8 - veor q10,q8 - bne .Linner_neon + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 - veor q10,q13 @ modulo-scheduled artefact - vshl.i64 d22,#48 - veor d21,d22 + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ - @ finalization, normalize Z:Zo - vand d2,d5 @ suffices to mask the bit - vshr.u64 d3,d20,#63 - vshl.i64 q10,#1 subs r3,#16 - vorr q10,q1 @ Z=Z:Zo<<1 - bne .Louter_neon + bne .Loop_neon #ifdef __ARMEL__ - vrev64.8 q10,q10 + vrev64.8 q0,q0 #endif sub r0,#16 - vst1.64 d21,[r0,:64]! @ write out Xi - vst1.64 d20,[r0,:64] + vst1.64 d1,[r0,:64]! @ write out Xi + vst1.64 d0,[r0,:64] - .word 0xe12fff1e + bx lr @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" diff --git a/app/openssl/crypto/modes/asm/ghash-armv4.pl b/app/openssl/crypto/modes/asm/ghash-armv4.pl index e46f8e34..b79ecbcc 100644 --- a/app/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/app/openssl/crypto/modes/asm/ghash-armv4.pl @@ -35,6 +35,20 @@ # Add NEON implementation featuring polynomial multiplication, i.e. no # lookup tables involved. On Cortex A8 it was measured to process one # byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - +# in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. 
& Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== # Note about "528B" variant. In ARM case it makes lesser sense to @@ -303,117 +317,160 @@ $code.=<<___; .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ { -my $cnt=$Htbl; # $Htbl is used once in the very beginning - -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); -my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); - -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit -# in Zo. Or should I say "top bit", because GHASH is specified in -# reverse bit order? Otherwise straightforward 128-bt H by one input -# byte multiplication and modulo-reduction, times 16. +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, 
$t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} $code.=<<___; #if __ARM_ARCH__>=7 .fpu neon +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1,:64]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1,:64] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + ret @ bx lr +.size gcm_init_neon,.-gcm_init_neon + .global gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: - sub $Htbl,#16 @ point at H in GCM128_CTX - vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi - vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$IN")`,[$Xi,:64]! - vshr.u64 $mod,#32 - vldmia $Htbl,{$Hhi-$Hlo} @ load H - veor $zero,$zero + vld1.64 $IN#hi,[$Xi,:64]! @ load Xi + vld1.64 $IN#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 $IN,$IN #endif - veor $Qpost,$Qpost - veor $R,$R - mov $cnt,#16 - veor $Z,$Z + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing mov $len,#16 - veor $Zo,$Zo - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte - b .Linner_neon + b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .global gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: - vld1.64 `&Dhi("$Z")`,[$Xi,:64]! 
@ load Xi - vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$Z")`,[$Xi,:64]! - vshr.u64 $mod,#32 - vldmia $Xi,{$Hhi-$Hlo} @ load H - veor $zero,$zero - nop + vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi + vld1.64 $Xl#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ - vrev64.8 $Z,$Z + vrev64.8 $Xl,$Xl #endif -.Louter_neon: - vld1.64 `&Dhi($IN)`,[$inp]! @ load inp - veor $Qpost,$Qpost - vld1.64 `&Dlo($IN)`,[$inp]! - veor $R,$R - mov $cnt,#16 + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! #ifdef __ARMEL__ vrev64.8 $IN,$IN #endif - veor $Zo,$Zo - veor $IN,$Z @ inp^=Xi - veor $Z,$Z - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte -.Linner_neon: - subs $cnt,$cnt,#1 - vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] - vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] - vext.8 $IN,$zero,#1 @ IN>>=8 - - veor $Z,$Qpost @ modulo-scheduled part - vshl.i64 `&Dlo("$R")`,#48 - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte - veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` - - veor `&Dhi("$Z")`,`&Dlo("$R")` - vuzp.8 $Qlo,$Qhi - vsli.8 $Zo,$T,#1 @ compose the "carry" byte - vext.8 $Z,$zero,#1 @ Z>>=8 - - vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 - vshr.u8 $Zo,$T,#7 @ save Z's bottom bit - vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 - veor $Z,$Qhi - bne .Linner_neon - - veor $Z,$Qpost @ modulo-scheduled artefact - vshl.i64 `&Dlo("$R")`,#48 - veor `&Dhi("$Z")`,`&Dlo("$R")` - - @ finalization, normalize Z:Zo - vand $Zo,$mod @ suffices to mask the bit - vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 - vshl.i64 $Z,#1 + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor 
$Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + subs $len,#16 - vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 - bne .Louter_neon + bne .Loop_neon #ifdef __ARMEL__ - vrev64.8 $Z,$Z + vrev64.8 $Xl,$Xl #endif sub $Xi,#16 - vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi - vst1.64 `&Dlo("$Z")`,[$Xi,:64] + vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi + vst1.64 $Xl#lo,[$Xi,:64] - bx lr + ret @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif ___ @@ -423,7 +480,13 @@ $code.=<<___; .align 2 ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} close STDOUT; # enforce flush diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx-64.S b/app/openssl/crypto/modes/asm/ghashv8-armx-64.S new file mode 100644 index 00000000..b77b6c40 --- /dev/null +++ b/app/openssl/crypto/modes/asm/ghashv8-armx-64.S @@ -0,0 +1,115 @@ +#include "arm_arch.h" + +.text +.arch armv8-a+crypto +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + ld1 {v17.2d},[x1] //load H + movi v16.16b,#0xe1 + ext v3.16b,v17.16b,v17.16b,#8 + shl v16.2d,v16.2d,#57 + ushr v18.2d,v16.2d,#63 + ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01 + dup v17.4s,v17.s[1] + ushr v19.2d,v3.2d,#63 
+ sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v19.16b,v19.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v19.16b,v19.16b,v19.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v19.16b //H<<<=1 + eor v3.16b,v3.16b,v16.16b //twisted H + st1 {v3.2d},[x0] + + ret +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d},[x1] //load twisted H + shl v19.2d,v19.2d,#57 +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ext v21.16b,v20.16b,v20.16b,#8 + mov x3,#0 + ext v3.16b,v17.16b,v17.16b,#8 + mov x12,#0 + eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing + mov x2,x0 + b .Lgmult_v8 +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + ld1 {v0.2d},[x0] //load [rotated] Xi + subs x3,x3,#16 + movi v19.16b,#0xe1 + mov x12,#16 + ld1 {v20.2d},[x1] //load twisted H + csel x12,xzr,x12,eq + ext v0.16b,v0.16b,v0.16b,#8 + shl v19.2d,v19.2d,#57 + ld1 {v17.2d},[x2],x12 //load [rotated] inp + ext v21.16b,v20.16b,v20.16b,#8 +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b + rev64 v17.16b,v17.16b +#endif + eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing + ext v3.16b,v17.16b,v17.16b,#8 + b .Loop_v8 + +.align 4 +.Loop_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi + +.Lgmult_v8: + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + subs x3,x3,#16 + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + csel x12,xzr,x12,eq + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] inp + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b 
+#endif + eor v0.16b,v1.16b,v18.16b + ext v3.16b,v17.16b,v17.16b,#8 + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + b.hs .Loop_v8 + +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx.S b/app/openssl/crypto/modes/asm/ghashv8-armx.S new file mode 100644 index 00000000..f388c54e --- /dev/null +++ b/app/openssl/crypto/modes/asm/ghashv8-armx.S @@ -0,0 +1,116 @@ +#include "arm_arch.h" + +.text +.fpu neon +.code 32 +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {q9},[r1] @ load H + vmov.i8 q8,#0xe1 + vext.8 q3,q9,q9,#8 + vshl.i64 q8,q8,#57 + vshr.u64 q10,q8,#63 + vext.8 q8,q10,q8,#8 @ t0=0xc2....01 + vdup.32 q9,d18[1] + vshr.u64 q11,q3,#63 + vshr.s32 q9,q9,#31 @ broadcast carry bit + vand q11,q11,q8 + vshl.i64 q3,q3,#1 + vext.8 q11,q11,q11,#8 + vand q8,q8,q9 + vorr q3,q3,q11 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vst1.64 {q3},[r0] + + bx lr +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {q9},[r0] @ load Xi + vmov.i8 q11,#0xe1 + vld1.64 {q12},[r1] @ load twisted H + vshl.u64 q11,q11,#57 +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q13,q12,q12,#8 + mov r3,#0 + vext.8 q3,q9,q9,#8 + mov r12,#0 + veor q13,q13,q12 @ Karatsuba pre-processing + mov r2,r0 + b .Lgmult_v8 +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + vld1.64 {q0},[r0] @ load [rotated] Xi + subs r3,r3,#16 + vmov.i8 q11,#0xe1 + mov r12,#16 + vld1.64 {q12},[r1] @ load twisted H + moveq r12,#0 + vext.8 q0,q0,q0,#8 + vshl.u64 q11,q11,#57 + vld1.64 {q9},[r2],r12 @ load [rotated] inp + vext.8 q13,q12,q12,#8 +#ifndef __ARMEB__ + 
vrev64.8 q0,q0 + vrev64.8 q9,q9 +#endif + veor q13,q13,q12 @ Karatsuba pre-processing + vext.8 q3,q9,q9,#8 + b .Loop_v8 + +.align 4 +.Loop_v8: + vext.8 q10,q0,q0,#8 + veor q3,q3,q0 @ inp^=Xi + veor q9,q9,q10 @ q9 is rotated inp^Xi + +.Lgmult_v8: + .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing + .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + subs r3,r3,#16 + .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + moveq r12,#0 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] inp + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + veor q0,q1,q10 + vext.8 q3,q9,q9,#8 + + vext.8 q10,q0,q0,#8 @ 2nd phase + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + bhs .Loop_v8 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr +.size gcm_ghash_v8,.-gcm_ghash_v8 +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff --git a/app/openssl/crypto/modes/asm/ghashv8-armx.pl b/app/openssl/crypto/modes/asm/ghashv8-armx.pl new file mode 100644 index 00000000..69e863e7 --- /dev/null +++ b/app/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,240 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. 
+# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# Current performance in cycles per processed byte: +# +# PMULL[2] 32-bit NEON(*) +# Apple A7 1.76 5.62 +# Cortex-A5x n/a n/a +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +open STDOUT,">".shift; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +$code.=<<___; +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {$t1},[x1] @ load H + vmov.i8 $t0,#0xe1 + vext.8 $IN,$t1,$t1,#8 + vshl.i64 $t0,$t0,#57 + vshr.u64 $t2,$t0,#63 + vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01 + vdup.32 $t1,${t1}[1] + vshr.u64 $t3,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t3,$t3,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t3,$t3,$t3,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t3 @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vst1.64 {$IN},[x0] + + ret +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $t3,#0xe1 + vld1.64 {$H},[$Htbl] @ load twisted H + vshl.u64 $t3,$t3,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $Hhl,$H,$H,#8 + mov $len,#0 + vext.8 $IN,$t1,$t1,#8 + mov $inc,#0 + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + mov $inp,$Xi + b .Lgmult_v8 +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + subs $len,$len,#16 + vmov.i8 $t3,#0xe1 + mov $inc,#16 + vld1.64 
{$H},[$Htbl] @ load twisted H + cclr $inc,eq + vext.8 $Xl,$Xl,$Xl,#8 + vshl.u64 $t3,$t3,#57 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + vext.8 $Hhl,$H,$H,#8 +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl + vrev64.8 $t1,$t1 +#endif + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + vext.8 $IN,$t1,$t1,#8 + b .Loop_v8 + +.align 4 +.Loop_v8: + vext.8 $t2,$Xl,$Xl,#8 + veor $IN,$IN,$Xl @ inp^=Xi + veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi + +.Lgmult_v8: + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + subs $len,$len,#16 + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + cclr $inc,eq + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$t3 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + veor $Xl,$Xm,$t2 + vext.8 $IN,$t1,$t1,#8 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$t3 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + b.hs .Loop_v8 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 
+ s/@\s/\/\//o; # old->new style commentary + + # fix up remainig legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; # enforce flush diff --git a/app/openssl/crypto/modes/gcm128.c b/app/openssl/crypto/modes/gcm128.c index e1dc2b0f..79ebb66e 100644 --- a/app/openssl/crypto/modes/gcm128.c +++ b/app/openssl/crypto/modes/gcm128.c @@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) #endif -#if TABLE_BITS==4 && defined(GHASH_ASM) +#if 
TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ)) # if !defined(I386_ONLY) && \ (defined(__i386) || defined(__i386__) || \ defined(__x86_64) || defined(__x86_64__) || \ @@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); # endif -# elif defined(__arm__) || defined(__arm) +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__) # include "arm_arch.h" # if __ARM_ARCH__>=7 # define GHASH_ASM_ARM # define GCM_FUNCREF_4BIT +# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL) +# if defined(__arm__) || defined(__arm) +# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) +# endif +void gcm_init_neon(u128 Htable[16],const u64 Xi[2]); void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_init_v8(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); # endif # endif #endif @@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) ctx->ghash = gcm_ghash_4bit; # endif # elif defined(GHASH_ASM_ARM) - if (OPENSSL_armcap_P & ARMV7_NEON) { +# ifdef PMULL_CAPABLE + if (PMULL_CAPABLE) { + gcm_init_v8(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_v8; + ctx->ghash = gcm_ghash_v8; + } else +# endif +# ifdef NEON_CAPABLE + if (NEON_CAPABLE) { + gcm_init_neon(ctx->Htable,ctx->H.u); ctx->gmult = gcm_gmult_neon; ctx->ghash = gcm_ghash_neon; - } else { + } else +# endif + { gcm_init_4bit(ctx->Htable,ctx->H.u); ctx->gmult = gcm_gmult_4bit; ctx->ghash = gcm_ghash_4bit; diff --git a/app/openssl/crypto/opensslconf-32.h b/app/openssl/crypto/opensslconf-32.h index d6625489..caf6f1b8 100644 --- a/app/openssl/crypto/opensslconf-32.h +++ 
b/app/openssl/crypto/opensslconf-32.h @@ -53,6 +53,9 @@ #ifndef OPENSSL_NO_RFC3779 # define OPENSSL_NO_RFC3779 #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif #ifndef OPENSSL_NO_RSAX # define OPENSSL_NO_RSAX #endif @@ -137,6 +140,9 @@ # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) # define NO_RFC3779 # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +# define NO_RIPEMD +# endif # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX) # define NO_RSAX # endif diff --git a/app/openssl/crypto/opensslconf-64.h b/app/openssl/crypto/opensslconf-64.h index 70c5a2cb..88fb0419 100644 --- a/app/openssl/crypto/opensslconf-64.h +++ b/app/openssl/crypto/opensslconf-64.h @@ -53,6 +53,9 @@ #ifndef OPENSSL_NO_RFC3779 # define OPENSSL_NO_RFC3779 #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif #ifndef OPENSSL_NO_RSAX # define OPENSSL_NO_RSAX #endif @@ -137,6 +140,9 @@ # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) # define NO_RFC3779 # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +# define NO_RIPEMD +# endif # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX) # define NO_RSAX # endif diff --git a/app/openssl/crypto/opensslconf-static-32.h b/app/openssl/crypto/opensslconf-static-32.h index d6625489..caf6f1b8 100644 --- a/app/openssl/crypto/opensslconf-static-32.h +++ b/app/openssl/crypto/opensslconf-static-32.h @@ -53,6 +53,9 @@ #ifndef OPENSSL_NO_RFC3779 # define OPENSSL_NO_RFC3779 #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif #ifndef OPENSSL_NO_RSAX # define OPENSSL_NO_RSAX #endif @@ -137,6 +140,9 @@ # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) # define NO_RFC3779 # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +# define NO_RIPEMD +# endif # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX) # define NO_RSAX # endif diff --git a/app/openssl/crypto/opensslconf-static-64.h b/app/openssl/crypto/opensslconf-static-64.h index 70c5a2cb..88fb0419 100644 
--- a/app/openssl/crypto/opensslconf-static-64.h +++ b/app/openssl/crypto/opensslconf-static-64.h @@ -53,6 +53,9 @@ #ifndef OPENSSL_NO_RFC3779 # define OPENSSL_NO_RFC3779 #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif #ifndef OPENSSL_NO_RSAX # define OPENSSL_NO_RSAX #endif @@ -137,6 +140,9 @@ # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) # define NO_RFC3779 # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +# define NO_RIPEMD +# endif # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX) # define NO_RSAX # endif diff --git a/app/openssl/crypto/opensslv.h b/app/openssl/crypto/opensslv.h index ebe71807..c3b6acec 100644 --- a/app/openssl/crypto/opensslv.h +++ b/app/openssl/crypto/opensslv.h @@ -25,11 +25,11 @@ * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for * major minor fix final patch/beta) */ -#define OPENSSL_VERSION_NUMBER 0x1000107fL +#define OPENSSL_VERSION_NUMBER 0x1000108fL #ifdef OPENSSL_FIPS -#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1g-fips 7 Apr 2014" +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1h-fips 5 Jun 2014" #else -#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1g 7 Apr 2014" +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1h 5 Jun 2014" #endif #define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT diff --git a/app/openssl/crypto/perlasm/x86asm.pl b/app/openssl/crypto/perlasm/x86asm.pl index eb543db2..3f190ae5 100644 --- a/app/openssl/crypto/perlasm/x86asm.pl +++ b/app/openssl/crypto/perlasm/x86asm.pl @@ -257,4 +257,6 @@ EOF &file($filename); } +sub ::hidden {} + 1; diff --git a/app/openssl/crypto/perlasm/x86gas.pl b/app/openssl/crypto/perlasm/x86gas.pl index 682a3a31..735c1ad2 100644 --- a/app/openssl/crypto/perlasm/x86gas.pl +++ b/app/openssl/crypto/perlasm/x86gas.pl @@ -250,4 +250,6 @@ ___ sub ::dataseg { push(@out,".data\n"); } +*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf); + 1; diff --git a/app/openssl/crypto/pkcs12/p12_crt.c 
b/app/openssl/crypto/pkcs12/p12_crt.c index a34915d0..35e8a4a8 100644 --- a/app/openssl/crypto/pkcs12/p12_crt.c +++ b/app/openssl/crypto/pkcs12/p12_crt.c @@ -96,7 +96,11 @@ PKCS12 *PKCS12_create(char *pass, char *name, EVP_PKEY *pkey, X509 *cert, nid_cert = NID_pbe_WithSHA1And3_Key_TripleDES_CBC; else #endif +#ifdef OPENSSL_NO_RC2 + nid_cert = NID_pbe_WithSHA1And3_Key_TripleDES_CBC; +#else nid_cert = NID_pbe_WithSHA1And40BitRC2_CBC; +#endif } if (!nid_key) nid_key = NID_pbe_WithSHA1And3_Key_TripleDES_CBC; @@ -286,7 +290,11 @@ int PKCS12_add_safe(STACK_OF(PKCS7) **psafes, STACK_OF(PKCS12_SAFEBAG) *bags, free_safes = 0; if (nid_safe == 0) +#ifdef OPENSSL_NO_RC2 + nid_safe = NID_pbe_WithSHA1And3_Key_TripleDES_CBC; +#else nid_safe = NID_pbe_WithSHA1And40BitRC2_CBC; +#endif if (nid_safe == -1) p7 = PKCS12_pack_p7data(bags); diff --git a/app/openssl/crypto/pkcs12/p12_kiss.c b/app/openssl/crypto/pkcs12/p12_kiss.c index 206b1b0b..c9b7ab61 100644 --- a/app/openssl/crypto/pkcs12/p12_kiss.c +++ b/app/openssl/crypto/pkcs12/p12_kiss.c @@ -269,7 +269,7 @@ static int parse_bag(PKCS12_SAFEBAG *bag, const char *pass, int passlen, int len, r; unsigned char *data; len = ASN1_STRING_to_UTF8(&data, fname); - if(len > 0) { + if(len >= 0) { r = X509_alias_set1(x509, data, len); OPENSSL_free(data); if (!r) diff --git a/app/openssl/crypto/pkcs7/pk7_doit.c b/app/openssl/crypto/pkcs7/pk7_doit.c index 77fda3b8..d91aa116 100644 --- a/app/openssl/crypto/pkcs7/pk7_doit.c +++ b/app/openssl/crypto/pkcs7/pk7_doit.c @@ -440,6 +440,11 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) { case NID_pkcs7_signed: data_body=PKCS7_get_octet_string(p7->d.sign->contents); + if (!PKCS7_is_detached(p7) && data_body == NULL) + { + PKCS7err(PKCS7_F_PKCS7_DATADECODE,PKCS7_R_INVALID_SIGNED_DATA_TYPE); + goto err; + } md_sk=p7->d.sign->md_algs; break; case NID_pkcs7_signedAndEnveloped: @@ -928,6 +933,7 @@ int PKCS7_SIGNER_INFO_sign(PKCS7_SIGNER_INFO *si) if 
(EVP_DigestSignUpdate(&mctx,abuf,alen) <= 0) goto err; OPENSSL_free(abuf); + abuf = NULL; if (EVP_DigestSignFinal(&mctx, NULL, &siglen) <= 0) goto err; abuf = OPENSSL_malloc(siglen); diff --git a/app/openssl/crypto/pkcs7/pkcs7.h b/app/openssl/crypto/pkcs7/pkcs7.h index e4d44319..04f60379 100644 --- a/app/openssl/crypto/pkcs7/pkcs7.h +++ b/app/openssl/crypto/pkcs7/pkcs7.h @@ -453,6 +453,7 @@ void ERR_load_PKCS7_strings(void); #define PKCS7_R_ERROR_SETTING_CIPHER 121 #define PKCS7_R_INVALID_MIME_TYPE 131 #define PKCS7_R_INVALID_NULL_POINTER 143 +#define PKCS7_R_INVALID_SIGNED_DATA_TYPE 155 #define PKCS7_R_MIME_NO_CONTENT_TYPE 132 #define PKCS7_R_MIME_PARSE_ERROR 133 #define PKCS7_R_MIME_SIG_PARSE_ERROR 134 diff --git a/app/openssl/crypto/pkcs7/pkcs7err.c b/app/openssl/crypto/pkcs7/pkcs7err.c index d0af32a2..f3db08e0 100644 --- a/app/openssl/crypto/pkcs7/pkcs7err.c +++ b/app/openssl/crypto/pkcs7/pkcs7err.c @@ -1,6 +1,6 @@ /* crypto/pkcs7/pkcs7err.c */ /* ==================================================================== - * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2014 The OpenSSL Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -130,6 +130,7 @@ static ERR_STRING_DATA PKCS7_str_reasons[]= {ERR_REASON(PKCS7_R_ERROR_SETTING_CIPHER),"error setting cipher"}, {ERR_REASON(PKCS7_R_INVALID_MIME_TYPE) ,"invalid mime type"}, {ERR_REASON(PKCS7_R_INVALID_NULL_POINTER),"invalid null pointer"}, +{ERR_REASON(PKCS7_R_INVALID_SIGNED_DATA_TYPE),"invalid signed data type"}, {ERR_REASON(PKCS7_R_MIME_NO_CONTENT_TYPE),"mime no content type"}, {ERR_REASON(PKCS7_R_MIME_PARSE_ERROR) ,"mime parse error"}, {ERR_REASON(PKCS7_R_MIME_SIG_PARSE_ERROR),"mime sig parse error"}, diff --git a/app/openssl/crypto/ripemd/README b/app/openssl/crypto/ripemd/README deleted file mode 100644 index f1ffc8b1..00000000 --- a/app/openssl/crypto/ripemd/README +++ /dev/null @@ -1,15 +0,0 @@ -RIPEMD-160 -http://www.esat.kuleuven.ac.be/~bosselae/ripemd160.html - -This is my implementation of RIPEMD-160. The pentium assember is a little -off the pace since I only get 1050 cycles, while the best is 1013. -I have a few ideas for how to get another 20 or so cycles, but at -this point I will not bother right now. I believe the trick will be -to remove my 'copy X array onto stack' until inside the RIP1() finctions the -first time round. To do this I need another register and will only have one -temporary one. A bit tricky.... I can also cleanup the saving of the 5 words -after the first half of the calculation. I should read the origional -value, add then write. Currently I just save the new and read the origioal. -I then read both at the end. Bad. 
- -eric (20-Jan-1998) diff --git a/app/openssl/crypto/ripemd/asm/rips.cpp b/app/openssl/crypto/ripemd/asm/rips.cpp deleted file mode 100644 index f7a13677..00000000 --- a/app/openssl/crypto/ripemd/asm/rips.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// -// gettsc.inl -// -// gives access to the Pentium's (secret) cycle counter -// -// This software was written by Leonard Janke (janke@unixg.ubc.ca) -// in 1996-7 and is entered, by him, into the public domain. - -#if defined(__WATCOMC__) -void GetTSC(unsigned long&); -#pragma aux GetTSC = 0x0f 0x31 "mov [edi], eax" parm [edi] modify [edx eax]; -#elif defined(__GNUC__) -inline -void GetTSC(unsigned long& tsc) -{ - asm volatile(".byte 15, 49\n\t" - : "=eax" (tsc) - : - : "%edx", "%eax"); -} -#elif defined(_MSC_VER) -inline -void GetTSC(unsigned long& tsc) -{ - unsigned long a; - __asm _emit 0fh - __asm _emit 31h - __asm mov a, eax; - tsc=a; -} -#endif - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/ripemd.h> - -#define ripemd160_block_x86 ripemd160_block_asm_host_order - -extern "C" { -void ripemd160_block_x86(RIPEMD160_CTX *ctx, unsigned char *buffer,int num); -} - -void main(int argc,char *argv[]) - { - unsigned char buffer[64*256]; - RIPEMD160_CTX ctx; - unsigned long s1,s2,e1,e2; - unsigned char k[16]; - unsigned long data[2]; - unsigned char iv[8]; - int i,num=0,numm; - int j=0; - - if (argc >= 2) - num=atoi(argv[1]); - - if (num == 0) num=16; - if (num > 250) num=16; - numm=num+2; -#if 0 - num*=64; - numm*=64; -#endif - - for (j=0; j<6; j++) - { - for (i=0; i<10; i++) /**/ - { - ripemd160_block_x86(&ctx,buffer,numm); - GetTSC(s1); - ripemd160_block_x86(&ctx,buffer,numm); - GetTSC(e1); - GetTSC(s2); - ripemd160_block_x86(&ctx,buffer,num); - GetTSC(e2); - ripemd160_block_x86(&ctx,buffer,num); - } - printf("ripemd160 (%d bytes) %d %d (%.2f)\n",num*64, - e1-s1,e2-s2,(double)((e1-s1)-(e2-s2))/2); - } - } - diff --git a/app/openssl/crypto/ripemd/asm/rmd-586.pl b/app/openssl/crypto/ripemd/asm/rmd-586.pl deleted 
file mode 100644 index e8b2bc2d..00000000 --- a/app/openssl/crypto/ripemd/asm/rmd-586.pl +++ /dev/null @@ -1,591 +0,0 @@ -#!/usr/local/bin/perl - -# Normal is the -# ripemd160_block_asm_data_order(RIPEMD160_CTX *c, ULONG *X,int blocks); - -$normal=0; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -$A="ecx"; -$B="esi"; -$C="edi"; -$D="ebx"; -$E="ebp"; -$tmp1="eax"; -$tmp2="edx"; - -$KL1=0x5A827999; -$KL2=0x6ED9EBA1; -$KL3=0x8F1BBCDC; -$KL4=0xA953FD4E; -$KR0=0x50A28BE6; -$KR1=0x5C4DD124; -$KR2=0x6D703EF3; -$KR3=0x7A6D76E9; - - -@wl=( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, - 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8, - 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12, - 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2, - 4, 0, 5, 9, 7,12, 2,10,14, 1, 3, 8,11, 6,15,13, - ); - -@wr=( 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12, - 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2, - 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13, - 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14, - 12,15,10, 4, 1, 5, 8, 7, 6, 2,13,14, 0, 3, 9,11, - ); - -@sl=( 11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8, - 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12, - 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5, - 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12, - 9,15, 5,11, 6, 8,13,12, 5,12,13,14,11, 8, 5, 6, - ); - -@sr=( 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6, - 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11, - 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5, - 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8, - 8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11, - ); - -&ripemd160_block("ripemd160_block_asm_data_order"); -&asm_finish(); - -sub Xv - { - local($n)=@_; - return(&swtmp($n)); - # tmp on stack - } - -sub Np - { - local($p)=@_; - local(%n)=($A,$E,$B,$A,$C,$B,$D,$C,$E,$D); - return($n{$p}); - } - -sub RIP1 - { - local($a,$b,$c,$d,$e,$pos,$s,$o,$pos2)=@_; - - &comment($p++); - if 
($p & 1) - { - #&mov($tmp1, $c) if $o == -1; - &xor($tmp1, $d) if $o == -1; - &mov($tmp2, &Xv($pos)); - &xor($tmp1, $b); - &add($a, $tmp2); - &rotl($c, 10); - &add($a, $tmp1); - &mov($tmp1, &Np($c)); # NEXT - # XXX - &rotl($a, $s); - &add($a, $e); - } - else - { - &xor($tmp1, $d); - &mov($tmp2, &Xv($pos)); - &xor($tmp1, $b); - &add($a, $tmp1); - &mov($tmp1, &Np($c)) if $o <= 0; - &mov($tmp1, -1) if $o == 1; - # XXX if $o == 2; - &rotl($c, 10); - &add($a, $tmp2); - &xor($tmp1, &Np($d)) if $o <= 0; - &mov($tmp2, &Xv($pos2)) if $o == 1; - &mov($tmp2, &wparam(0)) if $o == 2; - &rotl($a, $s); - &add($a, $e); - } - } - -sub RIP2 - { - local($a,$b,$c,$d,$e,$pos,$pos2,$s,$K,$o)=@_; - -# XXXXXX - &comment($p++); - if ($p & 1) - { -# &mov($tmp2, &Xv($pos)) if $o < -1; -# &mov($tmp1, -1) if $o < -1; - - &add($a, $tmp2); - &mov($tmp2, $c); - &sub($tmp1, $b); - &and($tmp2, $b); - &and($tmp1, $d); - &or($tmp2, $tmp1); - &mov($tmp1, &Xv($pos2)) if $o <= 0; # XXXXXXXXXXXXXX - # XXX - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2,1)); - &mov($tmp2, -1) if $o <= 0; - # XXX - &rotl($a, $s); - &add($a, $e); - } - else - { - # XXX - &add($a, $tmp1); - &mov($tmp1, $c); - &sub($tmp2, $b); - &and($tmp1, $b); - &and($tmp2, $d); - if ($o != 2) - { - &or($tmp1, $tmp2); - &mov($tmp2, &Xv($pos2)) if $o <= 0; - &mov($tmp2, -1) if $o == 1; - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp1,1)); - &mov($tmp1, -1) if $o <= 0; - &sub($tmp2, &Np($c)) if $o == 1; - } else { - &or($tmp2, $tmp1); - &mov($tmp1, &Np($c)); - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2,1)); - &xor($tmp1, &Np($d)); - } - &rotl($a, $s); - &add($a, $e); - } - } - -sub RIP3 - { - local($a,$b,$c,$d,$e,$pos,$s,$K,$o,$pos2)=@_; - - &comment($p++); - if ($p & 1) - { -# &mov($tmp2, -1) if $o < -1; -# &sub($tmp2, $c) if $o < -1; - &mov($tmp1, &Xv($pos)); - &or($tmp2, $b); - &add($a, $tmp1); - &xor($tmp2, $d); - &mov($tmp1, -1) if $o <= 0; # NEXT - # XXX - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2,1)); - &sub($tmp1, &Np($c)) if $o <= 0; # 
NEXT - # XXX - &rotl($a, $s); - &add($a, $e); - } - else - { - &mov($tmp2, &Xv($pos)); - &or($tmp1, $b); - &add($a, $tmp2); - &xor($tmp1, $d); - &mov($tmp2, -1) if $o <= 0; # NEXT - &mov($tmp2, -1) if $o == 1; - &mov($tmp2, &Xv($pos2)) if $o == 2; - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp1,1)); - &sub($tmp2, &Np($c)) if $o <= 0; # NEXT - &mov($tmp1, &Np($d)) if $o == 1; - &mov($tmp1, -1) if $o == 2; - &rotl($a, $s); - &add($a, $e); - } - } - -sub RIP4 - { - local($a,$b,$c,$d,$e,$pos,$s,$K,$o)=@_; - - &comment($p++); - if ($p & 1) - { -# &mov($tmp2, -1) if $o == -2; -# &mov($tmp1, $d) if $o == -2; - &sub($tmp2, $d); - &and($tmp1, $b); - &and($tmp2, $c); - &or($tmp2, $tmp1); - &mov($tmp1, &Xv($pos)); - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2)); - &mov($tmp2, -1) unless $o > 0; # NEXT - # XXX - &add($a, $tmp1); - &mov($tmp1, &Np($d)) unless $o > 0; # NEXT - # XXX - &rotl($a, $s); - &add($a, $e); - } - else - { - &sub($tmp2, $d); - &and($tmp1, $b); - &and($tmp2, $c); - &or($tmp2, $tmp1); - &mov($tmp1, &Xv($pos)); - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2)); - &mov($tmp2, -1) if $o == 0; # NEXT - &mov($tmp2, -1) if $o == 1; - &mov($tmp2, -1) if $o == 2; - # XXX - &add($a, $tmp1); - &mov($tmp1, &Np($d)) if $o == 0; # NEXT - &sub($tmp2, &Np($d)) if $o == 1; - &sub($tmp2, &Np($c)) if $o == 2; - # XXX - &rotl($a, $s); - &add($a, $e); - } - } - -sub RIP5 - { - local($a,$b,$c,$d,$e,$pos,$s,$K,$o)=@_; - - &comment($p++); - if ($p & 1) - { - &mov($tmp2, -1) if $o == -2; - &sub($tmp2, $d) if $o == -2; - &mov($tmp1, &Xv($pos)); - &or($tmp2, $c); - &add($a, $tmp1); - &xor($tmp2, $b); - &mov($tmp1, -1) if $o <= 0; - # XXX - &rotl($c, 10); - &lea($a, &DWP($K,$a,$tmp2,1)); - &sub($tmp1, &Np($d)) if $o <= 0; - # XXX - &rotl($a, $s); - &add($a, $e); - } - else - { - &mov($tmp2, &Xv($pos)); - &or($tmp1, $c); - &add($a, $tmp2); - &xor($tmp1, $b); - &mov($tmp2, -1) if $o <= 0; - &mov($tmp2, &wparam(0)) if $o == 1; # Middle code - &mov($tmp2, -1) if $o == 2; - &rotl($c, 10); - 
&lea($a, &DWP($K,$a,$tmp1,1)); - &sub($tmp2, &Np($d)) if $o <= 0; - &mov(&swtmp(16), $A) if $o == 1; - &mov($tmp1, &Np($d)) if $o == 2; - &rotl($a, $s); - &add($a, $e); - } - } - -sub ripemd160_block - { - local($name)=@_; - - &function_begin_B($name,"",3); - - # parameter 1 is the RIPEMD160_CTX structure. - # A 0 - # B 4 - # C 8 - # D 12 - # E 16 - - &mov($tmp2, &wparam(0)); - &mov($tmp1, &wparam(1)); - &push("esi"); - &mov($A, &DWP( 0,$tmp2,"",0)); - &push("edi"); - &mov($B, &DWP( 4,$tmp2,"",0)); - &push("ebp"); - &mov($C, &DWP( 8,$tmp2,"",0)); - &push("ebx"); - &stack_push(16+5+6); - # Special comment about the figure of 6. - # Idea is to pad the current frame so - # that the top of the stack gets fairly - # aligned. Well, as you realize it would - # always depend on how the frame below is - # aligned. The good news are that gcc-2.95 - # and later does keep first argument at - # least double-wise aligned. - # <appro@fy.chalmers.se> - - &set_label("start") unless $normal; - &comment(""); - - # &mov($tmp1, &wparam(1)); # Done at end of loop - # &mov($tmp2, &wparam(0)); # Done at end of loop - - for ($z=0; $z<16; $z+=2) - { - &mov($D, &DWP( $z*4,$tmp1,"",0)); - &mov($E, &DWP( ($z+1)*4,$tmp1,"",0)); - &mov(&swtmp($z), $D); - &mov(&swtmp($z+1), $E); - } - &mov($tmp1, $C); - &mov($D, &DWP(12,$tmp2,"",0)); - &mov($E, &DWP(16,$tmp2,"",0)); - - &RIP1($A,$B,$C,$D,$E,$wl[ 0],$sl[ 0],-1); - &RIP1($E,$A,$B,$C,$D,$wl[ 1],$sl[ 1],0); - &RIP1($D,$E,$A,$B,$C,$wl[ 2],$sl[ 2],0); - &RIP1($C,$D,$E,$A,$B,$wl[ 3],$sl[ 3],0); - &RIP1($B,$C,$D,$E,$A,$wl[ 4],$sl[ 4],0); - &RIP1($A,$B,$C,$D,$E,$wl[ 5],$sl[ 5],0); - &RIP1($E,$A,$B,$C,$D,$wl[ 6],$sl[ 6],0); - &RIP1($D,$E,$A,$B,$C,$wl[ 7],$sl[ 7],0); - &RIP1($C,$D,$E,$A,$B,$wl[ 8],$sl[ 8],0); - &RIP1($B,$C,$D,$E,$A,$wl[ 9],$sl[ 9],0); - &RIP1($A,$B,$C,$D,$E,$wl[10],$sl[10],0); - &RIP1($E,$A,$B,$C,$D,$wl[11],$sl[11],0); - &RIP1($D,$E,$A,$B,$C,$wl[12],$sl[12],0); - &RIP1($C,$D,$E,$A,$B,$wl[13],$sl[13],0); - 
&RIP1($B,$C,$D,$E,$A,$wl[14],$sl[14],0); - &RIP1($A,$B,$C,$D,$E,$wl[15],$sl[15],1,$wl[16]); - - &RIP2($E,$A,$B,$C,$D,$wl[16],$wl[17],$sl[16],$KL1,-1); - &RIP2($D,$E,$A,$B,$C,$wl[17],$wl[18],$sl[17],$KL1,0); - &RIP2($C,$D,$E,$A,$B,$wl[18],$wl[19],$sl[18],$KL1,0); - &RIP2($B,$C,$D,$E,$A,$wl[19],$wl[20],$sl[19],$KL1,0); - &RIP2($A,$B,$C,$D,$E,$wl[20],$wl[21],$sl[20],$KL1,0); - &RIP2($E,$A,$B,$C,$D,$wl[21],$wl[22],$sl[21],$KL1,0); - &RIP2($D,$E,$A,$B,$C,$wl[22],$wl[23],$sl[22],$KL1,0); - &RIP2($C,$D,$E,$A,$B,$wl[23],$wl[24],$sl[23],$KL1,0); - &RIP2($B,$C,$D,$E,$A,$wl[24],$wl[25],$sl[24],$KL1,0); - &RIP2($A,$B,$C,$D,$E,$wl[25],$wl[26],$sl[25],$KL1,0); - &RIP2($E,$A,$B,$C,$D,$wl[26],$wl[27],$sl[26],$KL1,0); - &RIP2($D,$E,$A,$B,$C,$wl[27],$wl[28],$sl[27],$KL1,0); - &RIP2($C,$D,$E,$A,$B,$wl[28],$wl[29],$sl[28],$KL1,0); - &RIP2($B,$C,$D,$E,$A,$wl[29],$wl[30],$sl[29],$KL1,0); - &RIP2($A,$B,$C,$D,$E,$wl[30],$wl[31],$sl[30],$KL1,0); - &RIP2($E,$A,$B,$C,$D,$wl[31],$wl[32],$sl[31],$KL1,1); - - &RIP3($D,$E,$A,$B,$C,$wl[32],$sl[32],$KL2,-1); - &RIP3($C,$D,$E,$A,$B,$wl[33],$sl[33],$KL2,0); - &RIP3($B,$C,$D,$E,$A,$wl[34],$sl[34],$KL2,0); - &RIP3($A,$B,$C,$D,$E,$wl[35],$sl[35],$KL2,0); - &RIP3($E,$A,$B,$C,$D,$wl[36],$sl[36],$KL2,0); - &RIP3($D,$E,$A,$B,$C,$wl[37],$sl[37],$KL2,0); - &RIP3($C,$D,$E,$A,$B,$wl[38],$sl[38],$KL2,0); - &RIP3($B,$C,$D,$E,$A,$wl[39],$sl[39],$KL2,0); - &RIP3($A,$B,$C,$D,$E,$wl[40],$sl[40],$KL2,0); - &RIP3($E,$A,$B,$C,$D,$wl[41],$sl[41],$KL2,0); - &RIP3($D,$E,$A,$B,$C,$wl[42],$sl[42],$KL2,0); - &RIP3($C,$D,$E,$A,$B,$wl[43],$sl[43],$KL2,0); - &RIP3($B,$C,$D,$E,$A,$wl[44],$sl[44],$KL2,0); - &RIP3($A,$B,$C,$D,$E,$wl[45],$sl[45],$KL2,0); - &RIP3($E,$A,$B,$C,$D,$wl[46],$sl[46],$KL2,0); - &RIP3($D,$E,$A,$B,$C,$wl[47],$sl[47],$KL2,1); - - &RIP4($C,$D,$E,$A,$B,$wl[48],$sl[48],$KL3,-1); - &RIP4($B,$C,$D,$E,$A,$wl[49],$sl[49],$KL3,0); - &RIP4($A,$B,$C,$D,$E,$wl[50],$sl[50],$KL3,0); - &RIP4($E,$A,$B,$C,$D,$wl[51],$sl[51],$KL3,0); - 
&RIP4($D,$E,$A,$B,$C,$wl[52],$sl[52],$KL3,0); - &RIP4($C,$D,$E,$A,$B,$wl[53],$sl[53],$KL3,0); - &RIP4($B,$C,$D,$E,$A,$wl[54],$sl[54],$KL3,0); - &RIP4($A,$B,$C,$D,$E,$wl[55],$sl[55],$KL3,0); - &RIP4($E,$A,$B,$C,$D,$wl[56],$sl[56],$KL3,0); - &RIP4($D,$E,$A,$B,$C,$wl[57],$sl[57],$KL3,0); - &RIP4($C,$D,$E,$A,$B,$wl[58],$sl[58],$KL3,0); - &RIP4($B,$C,$D,$E,$A,$wl[59],$sl[59],$KL3,0); - &RIP4($A,$B,$C,$D,$E,$wl[60],$sl[60],$KL3,0); - &RIP4($E,$A,$B,$C,$D,$wl[61],$sl[61],$KL3,0); - &RIP4($D,$E,$A,$B,$C,$wl[62],$sl[62],$KL3,0); - &RIP4($C,$D,$E,$A,$B,$wl[63],$sl[63],$KL3,1); - - &RIP5($B,$C,$D,$E,$A,$wl[64],$sl[64],$KL4,-1); - &RIP5($A,$B,$C,$D,$E,$wl[65],$sl[65],$KL4,0); - &RIP5($E,$A,$B,$C,$D,$wl[66],$sl[66],$KL4,0); - &RIP5($D,$E,$A,$B,$C,$wl[67],$sl[67],$KL4,0); - &RIP5($C,$D,$E,$A,$B,$wl[68],$sl[68],$KL4,0); - &RIP5($B,$C,$D,$E,$A,$wl[69],$sl[69],$KL4,0); - &RIP5($A,$B,$C,$D,$E,$wl[70],$sl[70],$KL4,0); - &RIP5($E,$A,$B,$C,$D,$wl[71],$sl[71],$KL4,0); - &RIP5($D,$E,$A,$B,$C,$wl[72],$sl[72],$KL4,0); - &RIP5($C,$D,$E,$A,$B,$wl[73],$sl[73],$KL4,0); - &RIP5($B,$C,$D,$E,$A,$wl[74],$sl[74],$KL4,0); - &RIP5($A,$B,$C,$D,$E,$wl[75],$sl[75],$KL4,0); - &RIP5($E,$A,$B,$C,$D,$wl[76],$sl[76],$KL4,0); - &RIP5($D,$E,$A,$B,$C,$wl[77],$sl[77],$KL4,0); - &RIP5($C,$D,$E,$A,$B,$wl[78],$sl[78],$KL4,0); - &RIP5($B,$C,$D,$E,$A,$wl[79],$sl[79],$KL4,1); - - # &mov($tmp2, &wparam(0)); # moved into last RIP5 - # &mov(&swtmp(16), $A); - &mov($A, &DWP( 0,$tmp2,"",0)); - &mov(&swtmp(16+1), $B); - &mov(&swtmp(16+2), $C); - &mov($B, &DWP( 4,$tmp2,"",0)); - &mov(&swtmp(16+3), $D); - &mov($C, &DWP( 8,$tmp2,"",0)); - &mov(&swtmp(16+4), $E); - &mov($D, &DWP(12,$tmp2,"",0)); - &mov($E, &DWP(16,$tmp2,"",0)); - - &RIP5($A,$B,$C,$D,$E,$wr[ 0],$sr[ 0],$KR0,-2); - &RIP5($E,$A,$B,$C,$D,$wr[ 1],$sr[ 1],$KR0,0); - &RIP5($D,$E,$A,$B,$C,$wr[ 2],$sr[ 2],$KR0,0); - &RIP5($C,$D,$E,$A,$B,$wr[ 3],$sr[ 3],$KR0,0); - &RIP5($B,$C,$D,$E,$A,$wr[ 4],$sr[ 4],$KR0,0); - &RIP5($A,$B,$C,$D,$E,$wr[ 5],$sr[ 5],$KR0,0); - 
&RIP5($E,$A,$B,$C,$D,$wr[ 6],$sr[ 6],$KR0,0); - &RIP5($D,$E,$A,$B,$C,$wr[ 7],$sr[ 7],$KR0,0); - &RIP5($C,$D,$E,$A,$B,$wr[ 8],$sr[ 8],$KR0,0); - &RIP5($B,$C,$D,$E,$A,$wr[ 9],$sr[ 9],$KR0,0); - &RIP5($A,$B,$C,$D,$E,$wr[10],$sr[10],$KR0,0); - &RIP5($E,$A,$B,$C,$D,$wr[11],$sr[11],$KR0,0); - &RIP5($D,$E,$A,$B,$C,$wr[12],$sr[12],$KR0,0); - &RIP5($C,$D,$E,$A,$B,$wr[13],$sr[13],$KR0,0); - &RIP5($B,$C,$D,$E,$A,$wr[14],$sr[14],$KR0,0); - &RIP5($A,$B,$C,$D,$E,$wr[15],$sr[15],$KR0,2); - - &RIP4($E,$A,$B,$C,$D,$wr[16],$sr[16],$KR1,-2); - &RIP4($D,$E,$A,$B,$C,$wr[17],$sr[17],$KR1,0); - &RIP4($C,$D,$E,$A,$B,$wr[18],$sr[18],$KR1,0); - &RIP4($B,$C,$D,$E,$A,$wr[19],$sr[19],$KR1,0); - &RIP4($A,$B,$C,$D,$E,$wr[20],$sr[20],$KR1,0); - &RIP4($E,$A,$B,$C,$D,$wr[21],$sr[21],$KR1,0); - &RIP4($D,$E,$A,$B,$C,$wr[22],$sr[22],$KR1,0); - &RIP4($C,$D,$E,$A,$B,$wr[23],$sr[23],$KR1,0); - &RIP4($B,$C,$D,$E,$A,$wr[24],$sr[24],$KR1,0); - &RIP4($A,$B,$C,$D,$E,$wr[25],$sr[25],$KR1,0); - &RIP4($E,$A,$B,$C,$D,$wr[26],$sr[26],$KR1,0); - &RIP4($D,$E,$A,$B,$C,$wr[27],$sr[27],$KR1,0); - &RIP4($C,$D,$E,$A,$B,$wr[28],$sr[28],$KR1,0); - &RIP4($B,$C,$D,$E,$A,$wr[29],$sr[29],$KR1,0); - &RIP4($A,$B,$C,$D,$E,$wr[30],$sr[30],$KR1,0); - &RIP4($E,$A,$B,$C,$D,$wr[31],$sr[31],$KR1,2); - - &RIP3($D,$E,$A,$B,$C,$wr[32],$sr[32],$KR2,-2); - &RIP3($C,$D,$E,$A,$B,$wr[33],$sr[33],$KR2,0); - &RIP3($B,$C,$D,$E,$A,$wr[34],$sr[34],$KR2,0); - &RIP3($A,$B,$C,$D,$E,$wr[35],$sr[35],$KR2,0); - &RIP3($E,$A,$B,$C,$D,$wr[36],$sr[36],$KR2,0); - &RIP3($D,$E,$A,$B,$C,$wr[37],$sr[37],$KR2,0); - &RIP3($C,$D,$E,$A,$B,$wr[38],$sr[38],$KR2,0); - &RIP3($B,$C,$D,$E,$A,$wr[39],$sr[39],$KR2,0); - &RIP3($A,$B,$C,$D,$E,$wr[40],$sr[40],$KR2,0); - &RIP3($E,$A,$B,$C,$D,$wr[41],$sr[41],$KR2,0); - &RIP3($D,$E,$A,$B,$C,$wr[42],$sr[42],$KR2,0); - &RIP3($C,$D,$E,$A,$B,$wr[43],$sr[43],$KR2,0); - &RIP3($B,$C,$D,$E,$A,$wr[44],$sr[44],$KR2,0); - &RIP3($A,$B,$C,$D,$E,$wr[45],$sr[45],$KR2,0); - &RIP3($E,$A,$B,$C,$D,$wr[46],$sr[46],$KR2,0); - 
&RIP3($D,$E,$A,$B,$C,$wr[47],$sr[47],$KR2,2,$wr[48]); - - &RIP2($C,$D,$E,$A,$B,$wr[48],$wr[49],$sr[48],$KR3,-2); - &RIP2($B,$C,$D,$E,$A,$wr[49],$wr[50],$sr[49],$KR3,0); - &RIP2($A,$B,$C,$D,$E,$wr[50],$wr[51],$sr[50],$KR3,0); - &RIP2($E,$A,$B,$C,$D,$wr[51],$wr[52],$sr[51],$KR3,0); - &RIP2($D,$E,$A,$B,$C,$wr[52],$wr[53],$sr[52],$KR3,0); - &RIP2($C,$D,$E,$A,$B,$wr[53],$wr[54],$sr[53],$KR3,0); - &RIP2($B,$C,$D,$E,$A,$wr[54],$wr[55],$sr[54],$KR3,0); - &RIP2($A,$B,$C,$D,$E,$wr[55],$wr[56],$sr[55],$KR3,0); - &RIP2($E,$A,$B,$C,$D,$wr[56],$wr[57],$sr[56],$KR3,0); - &RIP2($D,$E,$A,$B,$C,$wr[57],$wr[58],$sr[57],$KR3,0); - &RIP2($C,$D,$E,$A,$B,$wr[58],$wr[59],$sr[58],$KR3,0); - &RIP2($B,$C,$D,$E,$A,$wr[59],$wr[60],$sr[59],$KR3,0); - &RIP2($A,$B,$C,$D,$E,$wr[60],$wr[61],$sr[60],$KR3,0); - &RIP2($E,$A,$B,$C,$D,$wr[61],$wr[62],$sr[61],$KR3,0); - &RIP2($D,$E,$A,$B,$C,$wr[62],$wr[63],$sr[62],$KR3,0); - &RIP2($C,$D,$E,$A,$B,$wr[63],$wr[64],$sr[63],$KR3,2); - - &RIP1($B,$C,$D,$E,$A,$wr[64],$sr[64],-2); - &RIP1($A,$B,$C,$D,$E,$wr[65],$sr[65],0); - &RIP1($E,$A,$B,$C,$D,$wr[66],$sr[66],0); - &RIP1($D,$E,$A,$B,$C,$wr[67],$sr[67],0); - &RIP1($C,$D,$E,$A,$B,$wr[68],$sr[68],0); - &RIP1($B,$C,$D,$E,$A,$wr[69],$sr[69],0); - &RIP1($A,$B,$C,$D,$E,$wr[70],$sr[70],0); - &RIP1($E,$A,$B,$C,$D,$wr[71],$sr[71],0); - &RIP1($D,$E,$A,$B,$C,$wr[72],$sr[72],0); - &RIP1($C,$D,$E,$A,$B,$wr[73],$sr[73],0); - &RIP1($B,$C,$D,$E,$A,$wr[74],$sr[74],0); - &RIP1($A,$B,$C,$D,$E,$wr[75],$sr[75],0); - &RIP1($E,$A,$B,$C,$D,$wr[76],$sr[76],0); - &RIP1($D,$E,$A,$B,$C,$wr[77],$sr[77],0); - &RIP1($C,$D,$E,$A,$B,$wr[78],$sr[78],0); - &RIP1($B,$C,$D,$E,$A,$wr[79],$sr[79],2); - - # &mov($tmp2, &wparam(0)); # Moved into last round - - &mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B - &add($D, $tmp1); - &mov($tmp1, &swtmp(16+2)); # $c - &add($D, $tmp1); - - &mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C - &add($E, $tmp1); - &mov($tmp1, &swtmp(16+3)); # $d - &add($E, $tmp1); - - &mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D - &add($A, 
$tmp1); - &mov($tmp1, &swtmp(16+4)); # $e - &add($A, $tmp1); - - - &mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E - &add($B, $tmp1); - &mov($tmp1, &swtmp(16+0)); # $a - &add($B, $tmp1); - - &mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A - &add($C, $tmp1); - &mov($tmp1, &swtmp(16+1)); # $b - &add($C, $tmp1); - - &mov($tmp1, &wparam(2)); - - &mov(&DWP( 0,$tmp2,"",0), $D); - &mov(&DWP( 4,$tmp2,"",0), $E); - &mov(&DWP( 8,$tmp2,"",0), $A); - &sub($tmp1,1); - &mov(&DWP(12,$tmp2,"",0), $B); - &mov(&DWP(16,$tmp2,"",0), $C); - - &jle(&label("get_out")); - - &mov(&wparam(2),$tmp1); - &mov($C, $A); - &mov($tmp1, &wparam(1)); - &mov($A, $D); - &add($tmp1, 64); - &mov($B, $E); - &mov(&wparam(1),$tmp1); - - &jmp(&label("start")); - - &set_label("get_out"); - - &stack_pop(16+5+6); - - &pop("ebx"); - &pop("ebp"); - &pop("edi"); - &pop("esi"); - &ret(); - &function_end_B($name); - } - diff --git a/app/openssl/crypto/ripemd/ripemd.h b/app/openssl/crypto/ripemd/ripemd.h deleted file mode 100644 index 189bd8c9..00000000 --- a/app/openssl/crypto/ripemd/ripemd.h +++ /dev/null @@ -1,107 +0,0 @@ -/* crypto/ripemd/ripemd.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_RIPEMD_H -#define HEADER_RIPEMD_H - -#include <openssl/e_os2.h> -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef OPENSSL_NO_RIPEMD -#error RIPEMD is disabled. -#endif - -#if defined(__LP32__) -#define RIPEMD160_LONG unsigned long -#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) -#define RIPEMD160_LONG unsigned long -#define RIPEMD160_LONG_LOG2 3 -#else -#define RIPEMD160_LONG unsigned int -#endif - -#define RIPEMD160_CBLOCK 64 -#define RIPEMD160_LBLOCK (RIPEMD160_CBLOCK/4) -#define RIPEMD160_DIGEST_LENGTH 20 - -typedef struct RIPEMD160state_st - { - RIPEMD160_LONG A,B,C,D,E; - RIPEMD160_LONG Nl,Nh; - RIPEMD160_LONG data[RIPEMD160_LBLOCK]; - unsigned int num; - } RIPEMD160_CTX; - -#ifdef OPENSSL_FIPS -int private_RIPEMD160_Init(RIPEMD160_CTX *c); -#endif -int RIPEMD160_Init(RIPEMD160_CTX *c); -int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len); -int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c); -unsigned char *RIPEMD160(const unsigned char *d, size_t n, - unsigned char *md); -void RIPEMD160_Transform(RIPEMD160_CTX *c, const unsigned char *b); -#ifdef __cplusplus -} -#endif - -#endif diff --git 
a/app/openssl/crypto/ripemd/rmd160.c b/app/openssl/crypto/ripemd/rmd160.c deleted file mode 100644 index b0ec5744..00000000 --- a/app/openssl/crypto/ripemd/rmd160.c +++ /dev/null @@ -1,127 +0,0 @@ -/* crypto/ripemd/rmd160.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/ripemd.h> - -#define BUFSIZE 1024*16 - -void do_fp(FILE *f); -void pt(unsigned char *md); -#if !defined(_OSD_POSIX) && !defined(__DJGPP__) -int read(int, void *, unsigned int); -#endif - -int main(int argc, char **argv) - { - int i,err=0; - FILE *IN; - - if (argc == 1) - { - do_fp(stdin); - } - else - { - for (i=1; i<argc; i++) - { - IN=fopen(argv[i],"r"); - if (IN == NULL) - { - perror(argv[i]); - err++; - continue; - } - printf("RIPEMD160(%s)= ",argv[i]); - do_fp(IN); - fclose(IN); - } - } - exit(err); - } - -void do_fp(FILE *f) - { - RIPEMD160_CTX c; - unsigned char md[RIPEMD160_DIGEST_LENGTH]; - int fd; - int i; - static unsigned char buf[BUFSIZE]; - - fd=fileno(f); - RIPEMD160_Init(&c); - for (;;) - { - i=read(fd,buf,BUFSIZE); - if (i <= 0) break; - RIPEMD160_Update(&c,buf,(unsigned long)i); - } - RIPEMD160_Final(&(md[0]),&c); - pt(md); - } - -void pt(unsigned char *md) - { - int i; - - for (i=0; i<RIPEMD160_DIGEST_LENGTH; i++) - printf("%02x",md[i]); - printf("\n"); - } - diff --git a/app/openssl/crypto/ripemd/rmd_dgst.c b/app/openssl/crypto/ripemd/rmd_dgst.c deleted file mode 100644 index d8e72da5..00000000 --- a/app/openssl/crypto/ripemd/rmd_dgst.c +++ /dev/null @@ -1,292 +0,0 @@ -/* crypto/ripemd/rmd_dgst.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <stdio.h> -#include "rmd_locl.h" -#include <openssl/opensslv.h> -#include <openssl/crypto.h> - -const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT; - -# ifdef RMD160_ASM - void ripemd160_block_x86(RIPEMD160_CTX *c, unsigned long *p,size_t num); -# define ripemd160_block ripemd160_block_x86 -# else - void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num); -# endif - -fips_md_init(RIPEMD160) - { - memset (c,0,sizeof(*c)); - c->A=RIPEMD160_A; - c->B=RIPEMD160_B; - c->C=RIPEMD160_C; - c->D=RIPEMD160_D; - c->E=RIPEMD160_E; - return 1; - } - -#ifndef ripemd160_block_data_order -#ifdef X -#undef X -#endif -void ripemd160_block_data_order (RIPEMD160_CTX *ctx, const void *p, size_t num) - { - const unsigned char *data=p; - register unsigned MD32_REG_T A,B,C,D,E; - unsigned MD32_REG_T a,b,c,d,e,l; -#ifndef MD32_XARRAY - /* See comment in crypto/sha/sha_locl.h for details. 
*/ - unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; -# define X(i) XX##i -#else - RIPEMD160_LONG XX[16]; -# define X(i) XX[i] -#endif - - for (;num--;) - { - - A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - - (void)HOST_c2l(data,l); X( 0)=l;(void)HOST_c2l(data,l); X( 1)=l; - RIP1(A,B,C,D,E,WL00,SL00); (void)HOST_c2l(data,l); X( 2)=l; - RIP1(E,A,B,C,D,WL01,SL01); (void)HOST_c2l(data,l); X( 3)=l; - RIP1(D,E,A,B,C,WL02,SL02); (void)HOST_c2l(data,l); X( 4)=l; - RIP1(C,D,E,A,B,WL03,SL03); (void)HOST_c2l(data,l); X( 5)=l; - RIP1(B,C,D,E,A,WL04,SL04); (void)HOST_c2l(data,l); X( 6)=l; - RIP1(A,B,C,D,E,WL05,SL05); (void)HOST_c2l(data,l); X( 7)=l; - RIP1(E,A,B,C,D,WL06,SL06); (void)HOST_c2l(data,l); X( 8)=l; - RIP1(D,E,A,B,C,WL07,SL07); (void)HOST_c2l(data,l); X( 9)=l; - RIP1(C,D,E,A,B,WL08,SL08); (void)HOST_c2l(data,l); X(10)=l; - RIP1(B,C,D,E,A,WL09,SL09); (void)HOST_c2l(data,l); X(11)=l; - RIP1(A,B,C,D,E,WL10,SL10); (void)HOST_c2l(data,l); X(12)=l; - RIP1(E,A,B,C,D,WL11,SL11); (void)HOST_c2l(data,l); X(13)=l; - RIP1(D,E,A,B,C,WL12,SL12); (void)HOST_c2l(data,l); X(14)=l; - RIP1(C,D,E,A,B,WL13,SL13); (void)HOST_c2l(data,l); X(15)=l; - RIP1(B,C,D,E,A,WL14,SL14); - RIP1(A,B,C,D,E,WL15,SL15); - - RIP2(E,A,B,C,D,WL16,SL16,KL1); - RIP2(D,E,A,B,C,WL17,SL17,KL1); - RIP2(C,D,E,A,B,WL18,SL18,KL1); - RIP2(B,C,D,E,A,WL19,SL19,KL1); - RIP2(A,B,C,D,E,WL20,SL20,KL1); - RIP2(E,A,B,C,D,WL21,SL21,KL1); - RIP2(D,E,A,B,C,WL22,SL22,KL1); - RIP2(C,D,E,A,B,WL23,SL23,KL1); - RIP2(B,C,D,E,A,WL24,SL24,KL1); - RIP2(A,B,C,D,E,WL25,SL25,KL1); - RIP2(E,A,B,C,D,WL26,SL26,KL1); - RIP2(D,E,A,B,C,WL27,SL27,KL1); - RIP2(C,D,E,A,B,WL28,SL28,KL1); - RIP2(B,C,D,E,A,WL29,SL29,KL1); - RIP2(A,B,C,D,E,WL30,SL30,KL1); - RIP2(E,A,B,C,D,WL31,SL31,KL1); - - RIP3(D,E,A,B,C,WL32,SL32,KL2); - RIP3(C,D,E,A,B,WL33,SL33,KL2); - RIP3(B,C,D,E,A,WL34,SL34,KL2); - RIP3(A,B,C,D,E,WL35,SL35,KL2); - RIP3(E,A,B,C,D,WL36,SL36,KL2); - RIP3(D,E,A,B,C,WL37,SL37,KL2); - 
RIP3(C,D,E,A,B,WL38,SL38,KL2); - RIP3(B,C,D,E,A,WL39,SL39,KL2); - RIP3(A,B,C,D,E,WL40,SL40,KL2); - RIP3(E,A,B,C,D,WL41,SL41,KL2); - RIP3(D,E,A,B,C,WL42,SL42,KL2); - RIP3(C,D,E,A,B,WL43,SL43,KL2); - RIP3(B,C,D,E,A,WL44,SL44,KL2); - RIP3(A,B,C,D,E,WL45,SL45,KL2); - RIP3(E,A,B,C,D,WL46,SL46,KL2); - RIP3(D,E,A,B,C,WL47,SL47,KL2); - - RIP4(C,D,E,A,B,WL48,SL48,KL3); - RIP4(B,C,D,E,A,WL49,SL49,KL3); - RIP4(A,B,C,D,E,WL50,SL50,KL3); - RIP4(E,A,B,C,D,WL51,SL51,KL3); - RIP4(D,E,A,B,C,WL52,SL52,KL3); - RIP4(C,D,E,A,B,WL53,SL53,KL3); - RIP4(B,C,D,E,A,WL54,SL54,KL3); - RIP4(A,B,C,D,E,WL55,SL55,KL3); - RIP4(E,A,B,C,D,WL56,SL56,KL3); - RIP4(D,E,A,B,C,WL57,SL57,KL3); - RIP4(C,D,E,A,B,WL58,SL58,KL3); - RIP4(B,C,D,E,A,WL59,SL59,KL3); - RIP4(A,B,C,D,E,WL60,SL60,KL3); - RIP4(E,A,B,C,D,WL61,SL61,KL3); - RIP4(D,E,A,B,C,WL62,SL62,KL3); - RIP4(C,D,E,A,B,WL63,SL63,KL3); - - RIP5(B,C,D,E,A,WL64,SL64,KL4); - RIP5(A,B,C,D,E,WL65,SL65,KL4); - RIP5(E,A,B,C,D,WL66,SL66,KL4); - RIP5(D,E,A,B,C,WL67,SL67,KL4); - RIP5(C,D,E,A,B,WL68,SL68,KL4); - RIP5(B,C,D,E,A,WL69,SL69,KL4); - RIP5(A,B,C,D,E,WL70,SL70,KL4); - RIP5(E,A,B,C,D,WL71,SL71,KL4); - RIP5(D,E,A,B,C,WL72,SL72,KL4); - RIP5(C,D,E,A,B,WL73,SL73,KL4); - RIP5(B,C,D,E,A,WL74,SL74,KL4); - RIP5(A,B,C,D,E,WL75,SL75,KL4); - RIP5(E,A,B,C,D,WL76,SL76,KL4); - RIP5(D,E,A,B,C,WL77,SL77,KL4); - RIP5(C,D,E,A,B,WL78,SL78,KL4); - RIP5(B,C,D,E,A,WL79,SL79,KL4); - - a=A; b=B; c=C; d=D; e=E; - /* Do other half */ - A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - - RIP5(A,B,C,D,E,WR00,SR00,KR0); - RIP5(E,A,B,C,D,WR01,SR01,KR0); - RIP5(D,E,A,B,C,WR02,SR02,KR0); - RIP5(C,D,E,A,B,WR03,SR03,KR0); - RIP5(B,C,D,E,A,WR04,SR04,KR0); - RIP5(A,B,C,D,E,WR05,SR05,KR0); - RIP5(E,A,B,C,D,WR06,SR06,KR0); - RIP5(D,E,A,B,C,WR07,SR07,KR0); - RIP5(C,D,E,A,B,WR08,SR08,KR0); - RIP5(B,C,D,E,A,WR09,SR09,KR0); - RIP5(A,B,C,D,E,WR10,SR10,KR0); - RIP5(E,A,B,C,D,WR11,SR11,KR0); - RIP5(D,E,A,B,C,WR12,SR12,KR0); - RIP5(C,D,E,A,B,WR13,SR13,KR0); - RIP5(B,C,D,E,A,WR14,SR14,KR0); - 
RIP5(A,B,C,D,E,WR15,SR15,KR0); - - RIP4(E,A,B,C,D,WR16,SR16,KR1); - RIP4(D,E,A,B,C,WR17,SR17,KR1); - RIP4(C,D,E,A,B,WR18,SR18,KR1); - RIP4(B,C,D,E,A,WR19,SR19,KR1); - RIP4(A,B,C,D,E,WR20,SR20,KR1); - RIP4(E,A,B,C,D,WR21,SR21,KR1); - RIP4(D,E,A,B,C,WR22,SR22,KR1); - RIP4(C,D,E,A,B,WR23,SR23,KR1); - RIP4(B,C,D,E,A,WR24,SR24,KR1); - RIP4(A,B,C,D,E,WR25,SR25,KR1); - RIP4(E,A,B,C,D,WR26,SR26,KR1); - RIP4(D,E,A,B,C,WR27,SR27,KR1); - RIP4(C,D,E,A,B,WR28,SR28,KR1); - RIP4(B,C,D,E,A,WR29,SR29,KR1); - RIP4(A,B,C,D,E,WR30,SR30,KR1); - RIP4(E,A,B,C,D,WR31,SR31,KR1); - - RIP3(D,E,A,B,C,WR32,SR32,KR2); - RIP3(C,D,E,A,B,WR33,SR33,KR2); - RIP3(B,C,D,E,A,WR34,SR34,KR2); - RIP3(A,B,C,D,E,WR35,SR35,KR2); - RIP3(E,A,B,C,D,WR36,SR36,KR2); - RIP3(D,E,A,B,C,WR37,SR37,KR2); - RIP3(C,D,E,A,B,WR38,SR38,KR2); - RIP3(B,C,D,E,A,WR39,SR39,KR2); - RIP3(A,B,C,D,E,WR40,SR40,KR2); - RIP3(E,A,B,C,D,WR41,SR41,KR2); - RIP3(D,E,A,B,C,WR42,SR42,KR2); - RIP3(C,D,E,A,B,WR43,SR43,KR2); - RIP3(B,C,D,E,A,WR44,SR44,KR2); - RIP3(A,B,C,D,E,WR45,SR45,KR2); - RIP3(E,A,B,C,D,WR46,SR46,KR2); - RIP3(D,E,A,B,C,WR47,SR47,KR2); - - RIP2(C,D,E,A,B,WR48,SR48,KR3); - RIP2(B,C,D,E,A,WR49,SR49,KR3); - RIP2(A,B,C,D,E,WR50,SR50,KR3); - RIP2(E,A,B,C,D,WR51,SR51,KR3); - RIP2(D,E,A,B,C,WR52,SR52,KR3); - RIP2(C,D,E,A,B,WR53,SR53,KR3); - RIP2(B,C,D,E,A,WR54,SR54,KR3); - RIP2(A,B,C,D,E,WR55,SR55,KR3); - RIP2(E,A,B,C,D,WR56,SR56,KR3); - RIP2(D,E,A,B,C,WR57,SR57,KR3); - RIP2(C,D,E,A,B,WR58,SR58,KR3); - RIP2(B,C,D,E,A,WR59,SR59,KR3); - RIP2(A,B,C,D,E,WR60,SR60,KR3); - RIP2(E,A,B,C,D,WR61,SR61,KR3); - RIP2(D,E,A,B,C,WR62,SR62,KR3); - RIP2(C,D,E,A,B,WR63,SR63,KR3); - - RIP1(B,C,D,E,A,WR64,SR64); - RIP1(A,B,C,D,E,WR65,SR65); - RIP1(E,A,B,C,D,WR66,SR66); - RIP1(D,E,A,B,C,WR67,SR67); - RIP1(C,D,E,A,B,WR68,SR68); - RIP1(B,C,D,E,A,WR69,SR69); - RIP1(A,B,C,D,E,WR70,SR70); - RIP1(E,A,B,C,D,WR71,SR71); - RIP1(D,E,A,B,C,WR72,SR72); - RIP1(C,D,E,A,B,WR73,SR73); - RIP1(B,C,D,E,A,WR74,SR74); - RIP1(A,B,C,D,E,WR75,SR75); - RIP1(E,A,B,C,D,WR76,SR76); 
- RIP1(D,E,A,B,C,WR77,SR77); - RIP1(C,D,E,A,B,WR78,SR78); - RIP1(B,C,D,E,A,WR79,SR79); - - D =ctx->B+c+D; - ctx->B=ctx->C+d+E; - ctx->C=ctx->D+e+A; - ctx->D=ctx->E+a+B; - ctx->E=ctx->A+b+C; - ctx->A=D; - - } - } -#endif diff --git a/app/openssl/crypto/ripemd/rmd_locl.h b/app/openssl/crypto/ripemd/rmd_locl.h deleted file mode 100644 index 2bd8957d..00000000 --- a/app/openssl/crypto/ripemd/rmd_locl.h +++ /dev/null @@ -1,150 +0,0 @@ -/* crypto/ripemd/rmd_locl.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdlib.h> -#include <string.h> -#include <openssl/opensslconf.h> -#include <openssl/ripemd.h> - -#ifndef RIPEMD160_LONG_LOG2 -#define RIPEMD160_LONG_LOG2 2 /* default to 32 bits */ -#endif - -/* - * DO EXAMINE COMMENTS IN crypto/md5/md5_locl.h & crypto/md5/md5_dgst.c - * FOR EXPLANATIONS ON FOLLOWING "CODE." - * <appro@fy.chalmers.se> - */ -#ifdef RMD160_ASM -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -# define ripemd160_block_data_order ripemd160_block_asm_data_order -# endif -#endif - -void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num); - -#define DATA_ORDER_IS_LITTLE_ENDIAN - -#define HASH_LONG RIPEMD160_LONG -#define HASH_CTX RIPEMD160_CTX -#define HASH_CBLOCK RIPEMD160_CBLOCK -#define HASH_UPDATE RIPEMD160_Update -#define HASH_TRANSFORM RIPEMD160_Transform -#define HASH_FINAL RIPEMD160_Final -#define HASH_MAKE_STRING(c,s) do { \ - unsigned long ll; \ - ll=(c)->A; (void)HOST_l2c(ll,(s)); \ - ll=(c)->B; (void)HOST_l2c(ll,(s)); \ - ll=(c)->C; (void)HOST_l2c(ll,(s)); \ - ll=(c)->D; (void)HOST_l2c(ll,(s)); \ - ll=(c)->E; (void)HOST_l2c(ll,(s)); \ - } while (0) -#define HASH_BLOCK_DATA_ORDER ripemd160_block_data_order - -#include "md32_common.h" - -#if 0 -#define F1(x,y,z) ((x)^(y)^(z)) -#define F2(x,y,z) (((x)&(y))|((~x)&z)) -#define F3(x,y,z) (((x)|(~y))^(z)) -#define F4(x,y,z) (((x)&(z))|((y)&(~(z)))) -#define F5(x,y,z) ((x)^((y)|(~(z)))) -#else -/* - * Transformed F2 and F4 are courtesy of Wei Dai <weidai@eskimo.com> - */ -#define F1(x,y,z) ((x) ^ (y) ^ (z)) -#define F2(x,y,z) ((((y) ^ (z)) & (x)) ^ (z)) -#define F3(x,y,z) (((~(y)) | (x)) ^ (z)) -#define F4(x,y,z) ((((x) ^ (y)) & (z)) ^ (y)) -#define F5(x,y,z) (((~(z)) | (y)) ^ (x)) -#endif - -#define RIPEMD160_A 0x67452301L -#define RIPEMD160_B 0xEFCDAB89L -#define RIPEMD160_C 0x98BADCFEL -#define RIPEMD160_D 0x10325476L -#define RIPEMD160_E 0xC3D2E1F0L - -#include "rmdconst.h" - -#define RIP1(a,b,c,d,e,w,s) { \ - 
a+=F1(b,c,d)+X(w); \ - a=ROTATE(a,s)+e; \ - c=ROTATE(c,10); } - -#define RIP2(a,b,c,d,e,w,s,K) { \ - a+=F2(b,c,d)+X(w)+K; \ - a=ROTATE(a,s)+e; \ - c=ROTATE(c,10); } - -#define RIP3(a,b,c,d,e,w,s,K) { \ - a+=F3(b,c,d)+X(w)+K; \ - a=ROTATE(a,s)+e; \ - c=ROTATE(c,10); } - -#define RIP4(a,b,c,d,e,w,s,K) { \ - a+=F4(b,c,d)+X(w)+K; \ - a=ROTATE(a,s)+e; \ - c=ROTATE(c,10); } - -#define RIP5(a,b,c,d,e,w,s,K) { \ - a+=F5(b,c,d)+X(w)+K; \ - a=ROTATE(a,s)+e; \ - c=ROTATE(c,10); } - diff --git a/app/openssl/crypto/ripemd/rmd_one.c b/app/openssl/crypto/ripemd/rmd_one.c deleted file mode 100644 index 3efb1375..00000000 --- a/app/openssl/crypto/ripemd/rmd_one.c +++ /dev/null @@ -1,78 +0,0 @@ -/* crypto/ripemd/rmd_one.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <string.h> -#include <openssl/ripemd.h> -#include <openssl/crypto.h> - -unsigned char *RIPEMD160(const unsigned char *d, size_t n, - unsigned char *md) - { - RIPEMD160_CTX c; - static unsigned char m[RIPEMD160_DIGEST_LENGTH]; - - if (md == NULL) md=m; - if (!RIPEMD160_Init(&c)) - return NULL; - RIPEMD160_Update(&c,d,n); - RIPEMD160_Final(md,&c); - OPENSSL_cleanse(&c,sizeof(c)); /* security consideration */ - return(md); - } - diff --git a/app/openssl/crypto/ripemd/rmdconst.h b/app/openssl/crypto/ripemd/rmdconst.h deleted file mode 100644 index 59c48dea..00000000 --- a/app/openssl/crypto/ripemd/rmdconst.h +++ /dev/null @@ -1,399 +0,0 @@ -/* crypto/ripemd/rmdconst.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ -#define KL0 0x00000000L -#define KL1 0x5A827999L -#define KL2 0x6ED9EBA1L -#define KL3 0x8F1BBCDCL -#define KL4 0xA953FD4EL - -#define KR0 0x50A28BE6L -#define KR1 0x5C4DD124L -#define KR2 0x6D703EF3L -#define KR3 0x7A6D76E9L -#define KR4 0x00000000L - -#define WL00 0 -#define SL00 11 -#define WL01 1 -#define SL01 14 -#define WL02 2 -#define SL02 15 -#define WL03 3 -#define SL03 12 -#define WL04 4 -#define SL04 5 -#define WL05 5 -#define SL05 8 -#define WL06 6 -#define SL06 7 -#define WL07 7 -#define SL07 9 -#define WL08 8 -#define SL08 11 -#define WL09 9 -#define SL09 13 -#define WL10 10 -#define SL10 14 -#define WL11 11 -#define SL11 15 -#define WL12 12 -#define SL12 6 -#define WL13 13 -#define SL13 7 -#define WL14 14 -#define SL14 9 -#define WL15 15 -#define SL15 8 - -#define WL16 7 -#define SL16 7 -#define WL17 4 -#define SL17 6 -#define WL18 13 -#define SL18 8 -#define WL19 1 -#define SL19 13 -#define WL20 10 -#define SL20 11 -#define WL21 6 -#define SL21 9 -#define WL22 15 -#define SL22 7 -#define WL23 3 -#define SL23 15 -#define WL24 12 -#define SL24 7 -#define WL25 0 -#define SL25 12 -#define WL26 9 -#define SL26 15 -#define WL27 5 -#define SL27 9 -#define WL28 2 -#define SL28 11 -#define WL29 14 -#define SL29 7 -#define WL30 11 -#define SL30 13 -#define WL31 8 -#define SL31 12 - -#define WL32 3 -#define SL32 11 -#define WL33 10 -#define SL33 13 -#define WL34 14 -#define SL34 6 -#define WL35 4 -#define SL35 7 -#define WL36 9 -#define SL36 14 -#define WL37 15 -#define SL37 9 -#define WL38 8 -#define SL38 13 -#define WL39 1 -#define SL39 15 -#define WL40 2 -#define SL40 14 -#define WL41 7 -#define SL41 8 -#define WL42 0 -#define SL42 13 -#define WL43 6 -#define SL43 6 -#define WL44 13 -#define SL44 5 -#define WL45 11 -#define SL45 12 -#define WL46 5 -#define SL46 7 -#define WL47 12 -#define SL47 5 - -#define WL48 1 -#define SL48 11 -#define WL49 9 -#define SL49 12 -#define WL50 11 -#define SL50 14 -#define WL51 10 -#define SL51 15 -#define WL52 0 
-#define SL52 14 -#define WL53 8 -#define SL53 15 -#define WL54 12 -#define SL54 9 -#define WL55 4 -#define SL55 8 -#define WL56 13 -#define SL56 9 -#define WL57 3 -#define SL57 14 -#define WL58 7 -#define SL58 5 -#define WL59 15 -#define SL59 6 -#define WL60 14 -#define SL60 8 -#define WL61 5 -#define SL61 6 -#define WL62 6 -#define SL62 5 -#define WL63 2 -#define SL63 12 - -#define WL64 4 -#define SL64 9 -#define WL65 0 -#define SL65 15 -#define WL66 5 -#define SL66 5 -#define WL67 9 -#define SL67 11 -#define WL68 7 -#define SL68 6 -#define WL69 12 -#define SL69 8 -#define WL70 2 -#define SL70 13 -#define WL71 10 -#define SL71 12 -#define WL72 14 -#define SL72 5 -#define WL73 1 -#define SL73 12 -#define WL74 3 -#define SL74 13 -#define WL75 8 -#define SL75 14 -#define WL76 11 -#define SL76 11 -#define WL77 6 -#define SL77 8 -#define WL78 15 -#define SL78 5 -#define WL79 13 -#define SL79 6 - -#define WR00 5 -#define SR00 8 -#define WR01 14 -#define SR01 9 -#define WR02 7 -#define SR02 9 -#define WR03 0 -#define SR03 11 -#define WR04 9 -#define SR04 13 -#define WR05 2 -#define SR05 15 -#define WR06 11 -#define SR06 15 -#define WR07 4 -#define SR07 5 -#define WR08 13 -#define SR08 7 -#define WR09 6 -#define SR09 7 -#define WR10 15 -#define SR10 8 -#define WR11 8 -#define SR11 11 -#define WR12 1 -#define SR12 14 -#define WR13 10 -#define SR13 14 -#define WR14 3 -#define SR14 12 -#define WR15 12 -#define SR15 6 - -#define WR16 6 -#define SR16 9 -#define WR17 11 -#define SR17 13 -#define WR18 3 -#define SR18 15 -#define WR19 7 -#define SR19 7 -#define WR20 0 -#define SR20 12 -#define WR21 13 -#define SR21 8 -#define WR22 5 -#define SR22 9 -#define WR23 10 -#define SR23 11 -#define WR24 14 -#define SR24 7 -#define WR25 15 -#define SR25 7 -#define WR26 8 -#define SR26 12 -#define WR27 12 -#define SR27 7 -#define WR28 4 -#define SR28 6 -#define WR29 9 -#define SR29 15 -#define WR30 1 -#define SR30 13 -#define WR31 2 -#define SR31 11 - -#define WR32 15 -#define SR32 9 
-#define WR33 5 -#define SR33 7 -#define WR34 1 -#define SR34 15 -#define WR35 3 -#define SR35 11 -#define WR36 7 -#define SR36 8 -#define WR37 14 -#define SR37 6 -#define WR38 6 -#define SR38 6 -#define WR39 9 -#define SR39 14 -#define WR40 11 -#define SR40 12 -#define WR41 8 -#define SR41 13 -#define WR42 12 -#define SR42 5 -#define WR43 2 -#define SR43 14 -#define WR44 10 -#define SR44 13 -#define WR45 0 -#define SR45 13 -#define WR46 4 -#define SR46 7 -#define WR47 13 -#define SR47 5 - -#define WR48 8 -#define SR48 15 -#define WR49 6 -#define SR49 5 -#define WR50 4 -#define SR50 8 -#define WR51 1 -#define SR51 11 -#define WR52 3 -#define SR52 14 -#define WR53 11 -#define SR53 14 -#define WR54 15 -#define SR54 6 -#define WR55 0 -#define SR55 14 -#define WR56 5 -#define SR56 6 -#define WR57 12 -#define SR57 9 -#define WR58 2 -#define SR58 12 -#define WR59 13 -#define SR59 9 -#define WR60 9 -#define SR60 12 -#define WR61 7 -#define SR61 5 -#define WR62 10 -#define SR62 15 -#define WR63 14 -#define SR63 8 - -#define WR64 12 -#define SR64 8 -#define WR65 15 -#define SR65 5 -#define WR66 10 -#define SR66 12 -#define WR67 4 -#define SR67 9 -#define WR68 1 -#define SR68 12 -#define WR69 5 -#define SR69 5 -#define WR70 8 -#define SR70 14 -#define WR71 7 -#define SR71 6 -#define WR72 6 -#define SR72 8 -#define WR73 2 -#define SR73 13 -#define WR74 13 -#define SR74 6 -#define WR75 14 -#define SR75 5 -#define WR76 0 -#define SR76 15 -#define WR77 3 -#define SR77 13 -#define WR78 9 -#define SR78 11 -#define WR79 11 -#define SR79 11 - diff --git a/app/openssl/crypto/ripemd/rmdtest.c b/app/openssl/crypto/ripemd/rmdtest.c deleted file mode 100644 index fb34e0e8..00000000 --- a/app/openssl/crypto/ripemd/rmdtest.c +++ /dev/null @@ -1,145 +0,0 @@ -/* crypto/ripemd/rmdtest.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). 
- * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include "../e_os.h" - -#ifdef OPENSSL_NO_RIPEMD -int main(int argc, char *argv[]) -{ - printf("No ripemd support\n"); - return(0); -} -#else -#include <openssl/ripemd.h> -#include <openssl/evp.h> - -#ifdef CHARSET_EBCDIC -#include <openssl/ebcdic.h> -#endif - -static char *test[]={ - "", - "a", - "abc", - "message digest", - "abcdefghijklmnopqrstuvwxyz", - "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", - "12345678901234567890123456789012345678901234567890123456789012345678901234567890", - NULL, - }; - -static char *ret[]={ - "9c1185a5c5e9fc54612808977ee8f548b2258d31", - "0bdc9d2d256b3ee9daae347be6f4dc835a467ffe", - "8eb208f7e05d987a9b044a8e98c6b087f15a0bfc", - "5d0689ef49d2fae572b881b123a85ffa21595f36", - "f71c27109c692c1b56bbdceb5b9d2865b3708dbc", - "12a053384a9c0c88e405a06c27dcf49ada62eb2b", - "b0e20b6e3116640286ed3a87a5713079b21f5189", - "9b752e45573d4b39f4dbd3323cab82bf63326bfb", - }; - -static char *pt(unsigned char *md); -int main(int argc, char *argv[]) - { - int i,err=0; - char **P,**R; - char *p; - unsigned char md[RIPEMD160_DIGEST_LENGTH]; - - P=test; - R=ret; - i=1; - while (*P != NULL) - { -#ifdef CHARSET_EBCDIC - ebcdic2ascii((char *)*P, (char *)*P, strlen((char *)*P)); -#endif - EVP_Digest(&(P[0][0]),strlen((char *)*P),md,NULL,EVP_ripemd160(), NULL); - p=pt(md); - if (strcmp(p,(char *)*R) != 0) - { - printf("error calculating RIPEMD160 on '%s'\n",*P); - printf("got %s instead of %s\n",p,*R); - err++; - } - else - printf("test %d ok\n",i); - i++; - R++; - P++; - } - EXIT(err); - return(0); - } - -static char *pt(unsigned char *md) - { - int i; - static char buf[80]; - - for (i=0; i<RIPEMD160_DIGEST_LENGTH; i++) - sprintf(&(buf[i*2]),"%02x",md[i]); - return(buf); - } -#endif diff --git a/app/openssl/crypto/rsa/rsa_ameth.c b/app/openssl/crypto/rsa/rsa_ameth.c index 5a2062f9..4c8ecd92 100644 --- 
a/app/openssl/crypto/rsa/rsa_ameth.c +++ b/app/openssl/crypto/rsa/rsa_ameth.c @@ -358,7 +358,7 @@ static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) goto err; } - else if (BIO_puts(bp, "0x14 (default)") <= 0) + else if (BIO_puts(bp, "14 (default)") <= 0) goto err; BIO_puts(bp, "\n"); diff --git a/app/openssl/crypto/sha/asm/sha1-armv4-large.S b/app/openssl/crypto/sha/asm/sha1-armv4-large.S index 6523cbdd..a1562883 120000..100644 --- a/app/openssl/crypto/sha/asm/sha1-armv4-large.S +++ b/app/openssl/crypto/sha/asm/sha1-armv4-large.S @@ -1 +1,1450 @@ -sha1-armv4-large.s
\ No newline at end of file +#include "arm_arch.h" + +.text +.code 32 + +.global sha1_block_data_order +.type sha1_block_data_order,%function + +.align 5 +sha1_block_data_order: +#if __ARM_ARCH__>=7 + sub r3,pc,#8 @ sha1_block_data_order + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r3,r3,r10 @ E+=F_00_19(B,C,D) + teq r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + teq r14,sp @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + teq r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha1_block_data_order,.-sha1_block_data_order + +.align 5 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha1_block_data_order +.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 5 +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub sp,sp,#64 @ alloca + adr r8,.LK_00_19 + bic sp,sp,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov r12,sp + + vld1.8 {q0-q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2-q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! 
+ ldr r9,[sp] @ big RAW stall + +.Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! 
+ mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! 
+ mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! + add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! 
+ add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + subeq r1,r1,#64 + vld1.8 {q0-q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2-q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! 
+ add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + moveq sp,r14 + add r7,r7,r9 + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + addne r12,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4-r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type sha1_block_data_order_armv8,%function +.align 5 +sha1_block_data_order_armv8: +.LARMv8: + vstmdb sp!,{d8-d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,.LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +.Loop_v8: + vld1.8 {q4-q5},[r1]! + vld1.8 {q6-q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 0 + .byte 0x68,0x0c,0x02,0xf2 @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 1 + .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 2 + .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 3 + .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 4 + .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 5 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 6 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 7 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 8 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 9 + 
.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 10 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 11 + .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 12 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 13 + .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 14 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 15 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 16 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 17 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 18 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 19 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne .Loop_v8 + + vst1.32 {q0},[r0]! 
+ vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8-d15} + bx lr @ bx lr +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha1-armv4-large.pl b/app/openssl/crypto/sha/asm/sha1-armv4-large.pl index 33da3e0e..50bd07b3 100644 --- a/app/openssl/crypto/sha/asm/sha1-armv4-large.pl +++ b/app/openssl/crypto/sha/asm/sha1-armv4-large.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -52,6 +52,20 @@ # Profiler-assisted and platform-specific optimization resulted in 10% # improvement on Cortex A8 core and 12.2 cycles per byte. +# September 2013. +# +# Add NEON implementation (see sha1-586.pl for background info). On +# Cortex A8 it was measured to process one byte in 6.7 cycles or >80% +# faster than integer-only code. Because [fully unrolled] NEON code +# is ~2.5x larger and there are some redundant instructions executed +# when processing last block, improvement is not as big for smallest +# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per +# byte, which is also >80% faster than integer-only code. + +# May 2014. +# +# Add ARMv8 code path performing at 2.35 cpb on Apple A7. 
+ while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -153,12 +167,22 @@ $code=<<___; #include "arm_arch.h" .text +.code 32 .global sha1_block_data_order .type sha1_block_data_order,%function -.align 2 +.align 5 sha1_block_data_order: +#if __ARM_ARCH__>=7 + sub r3,pc,#8 @ sha1_block_data_order + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif stmdb sp!,{r4-r12,lr} add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp ldmia $ctx,{$a,$b,$c,$d,$e} @@ -233,16 +257,422 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.align 2 +.size sha1_block_data_order,.-sha1_block_data_order + +.align 5 .LK_00_19: .word 0x5a827999 .LK_20_39: .word 0x6ed9eba1 .LK_40_59: .word 0x8f1bbcdc .LK_60_79: .word 0xca62c1d6 -.size sha1_block_data_order,.-sha1_block_data_order -.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha1_block_data_order +.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 5 +___ +##################################################################### +# NEON stuff +# +{{{ +my @V=($a,$b,$c,$d,$e); +my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14)); +my $Xi=4; +my @X=map("q$_",(8..11,0..3)); +my @Tx=("q12","q13"); +my ($K,$zero)=("q14","q15"); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub body_00_19 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. 
+ '&bic ($t0,$d,$b)', + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&and ($t1,$c,$b)', + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&eor ($t1,$t1,$t0)', # F_00_19 + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. # e+=F_00_19 + '$j++; unshift(@V,pop(@V));' + ) +} +sub body_20_39 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. + '&eor ($t0,$b,$d)', + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)', + '&eor ($t1,$t0,$c)', # F_20_39 + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. # e+=F_20_39 + '$j++; unshift(@V,pop(@V));' + ) +} +sub body_40_59 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&and ($t0,$c,$d)', + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&eor ($t1,$c,$d)', + '&add ($e,$e,$t0)', + '&and ($t1,$t1,$b)', + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. 
# e+=F_40_59 + '$j++; unshift(@V,pop(@V));' + ) +} + +sub Xupdate_16_31 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); + eval(shift(@insns)); + &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8] + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@Tx[0],@Tx[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 (@Tx[0],@Tx[1],30); + eval(shift(@insns)); + eval(shift(@insns)); + &vshl_u32 (@Tx[1],@Tx[1],2); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@Tx[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_32_79 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &veor 
(@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 (@X[0],@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_80 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); + &sub ($Xfer,$Xfer,64); + + &teq ($inp,$len); + &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &subeq ($inp,$inp,64); # reload last block to avoid SEGV + &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19 + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[-4&7],@X[-4&7]); + + foreach (@insns) { eval; } # remaining instructions + + $Xi=0; +} + +sub Xloop() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU + + foreach (@insns) { eval; } + + $Xi++; 
+} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov $saved_sp,sp + sub sp,sp,#64 @ alloca + adr $K_XX_XX,.LK_00_19 + bic sp,sp,#15 @ align for 128-bit stores + + ldmia $ctx,{$a,$b,$c,$d,$e} @ load context + mov $Xfer,sp + + vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned + veor $zero,$zero,$zero + vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]! + vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19 + vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on + vrev32.8 @X[-3&7],@X[-3&7] @ big-endian... + vrev32.8 @X[-2&7],@X[-2&7] + vadd.i32 @X[0],@X[-4&7],$K + vrev32.8 @X[-1&7],@X[-1&7] + vadd.i32 @X[1],@X[-3&7],$K + vst1.32 {@X[0]},[$Xfer,:128]! + vadd.i32 @X[2],@X[-2&7],$K + vst1.32 {@X[1]},[$Xfer,:128]! + vst1.32 {@X[2]},[$Xfer,:128]! 
+ ldr $Ki,[sp] @ big RAW stall + +.Loop_neon: +___ + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_32_79(\&body_00_19); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_20_39); + &Xuplast_80(\&body_20_39); + &Xloop(\&body_20_39); + &Xloop(\&body_20_39); + &Xloop(\&body_20_39); +$code.=<<___; + ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context + add $a,$a,$Ki + ldr $Ki,[$ctx,#16] + add $b,$b,$t0 + add $c,$c,$t1 + add $d,$d,$Xfer + moveq sp,$saved_sp + add $e,$e,$Ki + ldrne $Ki,[sp] + stmia $ctx,{$a,$b,$c,$d,$e} + addne $Xfer,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4-r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +___ +}}} +##################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3)); +my @MSG=map("q$_",(4..7)); +my @Kxx=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.type sha1_block_data_order_armv8,%function +.align 5 +sha1_block_data_order_armv8: +.LARMv8: + vstmdb sp!,{d8-d15} @ ABI specification says so + + veor $E,$E,$E + adr r3,.LK_00_19 + vld1.32 {$ABCD},[$ctx]! + vld1.32 {$E\[0]},[$ctx] + sub $ctx,$ctx,#16 + vld1.32 {@Kxx[0]\[]},[r3,:32]! + vld1.32 {@Kxx[1]\[]},[r3,:32]! + vld1.32 {@Kxx[2]\[]},[r3,:32]! + vld1.32 {@Kxx[3]\[]},[r3,:32] + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! 
+ vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + + vadd.i32 $W0,@Kxx[0],@MSG[0] + vrev32.8 @MSG[2],@MSG[2] + vmov $ABCD_SAVE,$ABCD @ offload + subs $len,$len,#1 + + vadd.i32 $W1,@Kxx[0],@MSG[1] + vrev32.8 @MSG[3],@MSG[3] + sha1h $E1,$ABCD @ 0 + sha1c $ABCD,$E,$W0 + vadd.i32 $W0,@Kxx[$j],@MSG[2] + sha1su0 @MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; + sha1h $E0,$ABCD @ $i + sha1$f $ABCD,$E1,$W1 + vadd.i32 $W1,@Kxx[$j],@MSG[3] + sha1su1 @MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); + sha1su0 @MSG[1],@MSG[2],@MSG[3] ___ + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); +} +$code.=<<___; + sha1h $E0,$ABCD @ $i + sha1p $ABCD,$E1,$W1 + vadd.i32 $W1,@Kxx[$j],@MSG[3] + + sha1h $E1,$ABCD @ 18 + sha1p $ABCD,$E0,$W0 + + sha1h $E0,$ABCD @ 19 + sha1p $ABCD,$E1,$W1 + + vadd.i32 $E,$E,$E0 + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + bne .Loop_v8 + + vst1.32 {$ABCD},[$ctx]! + vst1.32 {$E\[0]},[$ctx] + + vldmia sp!,{d8-d15} + ret @ bx lr +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.comm OPENSSL_armcap_P,4,4 +___ + +{ my %opcode = ( + "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40, + "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40, + "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 ); + + sub unsha1 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or + s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo; + + s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo; + + s/\bret\b/bx lr/o or + s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4 + + print $_,$/; +} -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; close STDOUT; # enforce flush diff --git a/app/openssl/crypto/sha/asm/sha1-armv4-large.s b/app/openssl/crypto/sha/asm/sha1-armv4-large.s deleted file mode 100644 index 639ae78a..00000000 --- a/app/openssl/crypto/sha/asm/sha1-armv4-large.s +++ /dev/null @@ -1,452 +0,0 @@ -#include "arm_arch.h" - -.text - -.global sha1_block_data_order -.type sha1_block_data_order,%function - -.align 2 -sha1_block_data_order: - stmdb sp!,{r4-r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -.Lloop: - ldr r8,.LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -.L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) - teq r14,sp - bne .L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,.LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -.L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) - teq r14,sp @ preserve carry - bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,.LK_40_59 - sub sp,sp,#20*4 @ [+2] -.L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 - teq r14,sp - bne .L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,.LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b .L_20_39_or_60_79 @ [+4], spare 300 bytes -.L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne .Lloop @ [+18], total 1307 - -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.align 2 -.LK_00_19: .word 0x5a827999 -.LK_20_39: .word 0x6ed9eba1 -.LK_40_59: .word 0x8f1bbcdc -.LK_60_79: .word 0xca62c1d6 -.size sha1_block_data_order,.-sha1_block_data_order -.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align 2 diff --git a/app/openssl/crypto/sha/asm/sha1-armv8.S b/app/openssl/crypto/sha/asm/sha1-armv8.S new file mode 100644 index 00000000..f9d12625 --- /dev/null +++ b/app/openssl/crypto/sha/asm/sha1-armv8.S @@ -0,0 +1,1211 @@ +#include "arm_arch.h" + +.text + +.globl sha1_block_data_order +.type sha1_block_data_order,%function +.align 6 +sha1_block_data_order: + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA1 + b.ne .Lv8_entry + + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +.Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add 
w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x17,x17,#32 +#else + rev32 
x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // 
e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror 
w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // 
e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add 
w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // 
e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + 
add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future 
e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + 
add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + 
add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,.Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret +.size sha1_block_data_order,.-sha1_block_data_order +.type sha1_block_armv8,%function +.align 6 +sha1_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adr x4,.Lconst + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s-v19.4s},[x4] + +.Loop_hw: + ld1 {v4.16b-v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b + .inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s + .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 1 + .inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s + .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b + .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 2 + .inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s + .inst 0x5e281885 //sha1su1 v5.16b,v4.16b + .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 3 + .inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s + .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b + .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 4 + .inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s + .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b + .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 5 + .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s + .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b + .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 6 + .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s + .inst 0x5e281885 //sha1su1 v5.16b,v4.16b + .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 7 + .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s + .inst 0x5e2818a6 //sha1su1 
v6.16b,v5.16b + .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 8 + .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s + .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b + .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 9 + .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s + .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b + .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 10 + .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s + .inst 0x5e281885 //sha1su1 v5.16b,v4.16b + .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 11 + .inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s + .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b + .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 12 + .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s + .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b + .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 13 + .inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b + .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 14 + .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s + .inst 0x5e281885 //sha1su1 v5.16b,v4.16b + .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 15 + .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s + .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b + .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 16 + .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s + .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 17 + .inst 
0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + + .inst 0x5e280803 //sha1h v3.16b,v0.16b // 18 + .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + + .inst 0x5e280802 //sha1h v2.16b,v0.16b // 19 + .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret +.size sha1_block_armv8,.-sha1_block_armv8 +.align 6 +.Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.LOPENSSL_armcap_P: +.quad OPENSSL_armcap_P-. +.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha1-armv8.pl b/app/openssl/crypto/sha/asm/sha1-armv8.pl new file mode 100644 index 00000000..c1f552b6 --- /dev/null +++ b/app/openssl/crypto/sha/asm/sha1-armv8.pl @@ -0,0 +1,333 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# hardware-assisted software(*) +# Apple A7 2.31 4.13 (+14%) +# Cortex-A5x n/a n/a +# +# (*) Software results are presented mostly for reference purposes. 
+ +$flavour = shift; +open STDOUT,">".shift; + +($ctx,$inp,$num)=("x0","x1","x2"); +@Xw=map("w$_",(3..17,19)); +@Xx=map("x$_",(3..17,19)); +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); +($t0,$t1,$t2,$K)=map("w$_",(25..28)); + + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i<15 && !($i&1)); + lsr @Xx[$i+1],@Xx[$i],#32 +___ +$code.=<<___ if ($i<14 && !($i&1)); + ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] +___ +$code.=<<___ if ($i<14 && ($i&1)); +#ifdef __ARMEB__ + ror @Xx[$i+1],@Xx[$i+1],#32 +#else + rev32 @Xx[$i+1],@Xx[$i+1] +#endif +___ +$code.=<<___ if ($i<14); + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==19); + movz $K,#0xeba1 + movk $K,#0x6ed9,lsl#16 +___ +$code.=<<___ if ($i>=14); + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==59); + movz $K,#0xc1d6 + movk $K,#0xca62,lsl#16 +___ +$code.=<<___; + orr $t0,$b,$c + and $t1,$b,$c + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + ror $t2,$a,#27 + and $t0,$t0,$d + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $e,$e,$t2 // e+=rot(a,5) + orr $t0,$t0,$t1 + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==39); + movz $K,#0xbcdc + movk $K,#0x8f1b,lsl#16 +___ +$code.=<<___ if ($i<78); + 
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +$code.=<<___ if ($i==78); + ldp @Xw[1],@Xw[2],[$ctx] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==79); + ldp @Xw[3],@Xw[4],[$ctx,#8] + eor $t0,$d,$b + ror $t2,$a,#27 + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + ldr @Xw[5],[$ctx,#16] + add $e,$e,$t0 // e+=F(b,c,d) +___ +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.globl sha1_block_data_order +.type sha1_block_data_order,%function +.align 6 +sha1_block_data_order: + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA1 + b.ne .Lv8_entry + + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldr $E,[$ctx,#16] + +.Loop: + ldr @Xx[0],[$inp],#64 + movz $K,#0x7999 + sub $num,$num,#1 + movk $K,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror $Xx[0],@Xx[0],#32 +#else + rev32 @Xx[0],@Xx[0] +#endif + add $E,$E,$K // warm it up + add $E,$E,@Xw[0] +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add $B,$B,@Xw[2] + add $C,$C,@Xw[3] + add $A,$A,@Xw[1] + add $D,$D,@Xw[4] + add $E,$E,@Xw[5] + stp $A,$B,[$ctx] + stp $C,$D,[$ctx,#8] + str $E,[$ctx,#16] + cbnz $num,.Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret +.size sha1_block_data_order,.-sha1_block_data_order +___ +{{{ +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); +my @MSG=map("v$_.16b",(4..7)); +my @Kxx=map("v$_.4s",(16..19)); +my ($W0,$W1)=("v20.4s","v21.4s"); +my $ABCD_SAVE="v22.16b"; + +$code.=<<___; +.type sha1_block_armv8,%function +.align 6 +sha1_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adr x4,.Lconst + eor $E,$E,$E + ld1.32 {$ABCD},[$ctx],#16 + ld1.32 {$E}[0],[$ctx] + sub $ctx,$ctx,#16 + ld1.32 {@Kxx[0]-@Kxx[3]},[x4] + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + + add.i32 $W0,@Kxx[0],@MSG[0] + rev32 @MSG[2],@MSG[2] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + + add.i32 $W1,@Kxx[0],@MSG[1] + rev32 @MSG[3],@MSG[3] + sha1h $E1,$ABCD + sha1c $ABCD,$E,$W0 // 0 + add.i32 $W0,@Kxx[$j],@MSG[2] + sha1su0 @MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1$f $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + sha1su1 @MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); + sha1su0 @MSG[1],@MSG[2],@MSG[3] +___ + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); +} +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1p $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + + sha1h $E1,$ABCD // 18 + sha1p $ABCD,$E0,$W0 + + sha1h $E0,$ABCD // 19 + sha1p $ABCD,$E1,$W1 + + add.i32 $E,$E,$E0 + add.i32 $ABCD,$ABCD,$ABCD_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD},[$ctx],#16 + st1.32 {$E}[0],[$ctx] + + ldr x29,[sp],#16 + ret +.size sha1_block_armv8,.-sha1_block_armv8 +.align 6 +.Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.LOPENSSL_armcap_P: +.quad OPENSSL_armcap_P-. 
+.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +.comm OPENSSL_armcap_P,4,4 +___ +}}} + +{ my %opcode = ( + "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, + "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, + "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); + + sub unsha1 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/app/openssl/crypto/sha/asm/sha256-armv4.S b/app/openssl/crypto/sha/asm/sha256-armv4.S index d4f53c1d..853d7da5 120000..100644 --- a/app/openssl/crypto/sha/asm/sha256-armv4.S +++ b/app/openssl/crypto/sha/asm/sha256-armv4.S @@ -1 +1,2690 @@ -sha256-armv4.s
\ No newline at end of file +#include "arm_arch.h" + +.text +.code 32 + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: + sub r3,pc,#8 @ sha256_block_data_order + add r2,r1,r2,lsl#6 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + 
orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + mov r12,sp + sub sp,sp,#16*4+16 @ alloca + sub r14,r3,#256+32 @ K256 + bic sp,sp,#15 @ align for 128-bit stores + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! 
+ vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4-r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! 
+ add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! 
+ add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + 
add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8-r11} + + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#sha256_block_data_order-K256 + +.Loop_v8: + vld1.8 {q8-q9},[r1]! + vld1.8 {q10-q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! 
+ vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha256-armv4.pl b/app/openssl/crypto/sha/asm/sha256-armv4.pl index 9c84e8d9..505ca8f3 100644 --- a/app/openssl/crypto/sha/asm/sha256-armv4.pl +++ b/app/openssl/crypto/sha/asm/sha256-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. 
The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -21,15 +21,27 @@ # February 2011. # # Profiler-assisted and platform-specific optimization resulted in 16% -# improvement on Cortex A8 core and ~17 cycles per processed byte. +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that the latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; $t0="r0"; -$inp="r1"; $t3="r1"; +$inp="r1"; $t4="r1"; $len="r2"; $t1="r2"; -$T1="r3"; +$T1="r3"; $t3="r3"; $A="r4"; $B="r5"; $C="r6"; @@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___ if ($i<16); #if __ARM_ARCH__>=7 - ldr $T1,[$inp],#4 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + rev $t1,$t1 #else - ldrb $T1,[$inp,#3] @ $i + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past ldrb $t2,[$inp,#2] - ldrb $t1,[$inp,#1] - ldrb $t0,[$inp],#4 - orr $T1,$T1,$t2,lsl#8 - orr $T1,$T1,$t1,lsl#16 - orr $T1,$T1,$t0,lsl#24 + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) #endif ___ $code.=<<___; - mov
$t0,$e,ror#$Sigma1[0] ldr $t2,[$Ktbl],#4 @ *K256++ - eor $t0,$t0,$e,ror#$Sigma1[1] + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] eor $t1,$f,$g -#if $i>=16 - add $T1,$T1,$t3 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev $T1,$T1 -#endif -#if $i==15 - str $inp,[sp,#17*4] @ leave room for $t3 -#endif - eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) and $t1,$t1,$e - str $T1,[sp,#`$i%16`*4] - add $T1,$T1,$t0 + add $h,$h,$t2 @ h+=K256[i] eor $t1,$t1,$g @ Ch(e,f,g) - add $T1,$T1,$h - mov $h,$a,ror#$Sigma0[0] - add $T1,$T1,$t1 - eor $h,$h,$a,ror#$Sigma0[1] - add $T1,$T1,$t2 - eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) -#if $i>=15 - ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? #endif - orr $t0,$a,$b - and $t1,$a,$b - and $t0,$t0,$c - add $h,$h,$T1 - orr $t0,$t0,$t1 @ Maj(a,b,c) - add $d,$d,$T1 - add $h,$h,$t0 +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) ___ + ($t2,$t3)=($t3,$t2); } sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i - ldr $t2,[sp,#`($i+14)%16`*4] - mov $t0,$t3,ror#$sigma0[0] - ldr $T1,[sp,#`($i+0)%16`*4] - eor $t0,$t0,$t3,ror#$sigma0[1] - ldr $t1,[sp,#`($i+9)%16`*4] - eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) - mov $t3,$t2,ror#$sigma1[0] - add $T1,$T1,$t0 - eor $t3,$t3,$t2,ror#$sigma1[1] - add $T1,$T1,$t1 - eor 
$t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) - @ add $T1,$T1,$t3 + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] ___ &BODY_00_15(@_); } @@ -147,46 +176,64 @@ K256: .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 +.word 0 @ terminator +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +.align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} - sub $Ktbl,r3,#256 @ K256 + sub $Ktbl,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 ___ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } $code.=".Lrounds_16_xx:\n"; for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - and $t2,$t2,#0xff - cmp $t2,#0xf2 + ldreq $t3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx - ldr $T1,[sp,#16*4] @ pull ctx - ldr $t0,[$T1,#0] - ldr $t1,[$T1,#4] - ldr $t2,[$T1,#8] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr 
$t1,[$t3,#4] + ldr $t2,[$t3,#8] add $A,$A,$t0 - ldr $t0,[$T1,#12] + ldr $t0,[$t3,#12] add $B,$B,$t1 - ldr $t1,[$T1,#16] + ldr $t1,[$t3,#16] add $C,$C,$t2 - ldr $t2,[$T1,#20] + ldr $t2,[$t3,#20] add $D,$D,$t0 - ldr $t0,[$T1,#24] + ldr $t0,[$t3,#24] add $E,$E,$t1 - ldr $t1,[$T1,#28] + ldr $t1,[$t3,#28] add $F,$F,$t2 ldr $inp,[sp,#17*4] @ pull inp ldr $t2,[sp,#18*4] @ pull inp+len add $G,$G,$t0 add $H,$H,$t1 - stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H} + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} cmp $inp,$t2 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl bne .Loop @@ -200,12 +247,410 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha256_block_data_order,.-sha256_block_data_order -.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + 
eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 
(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. 
# h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + mov $t2,sp + sub sp,sp,#16*4+16 @ alloca + sub $Ktbl,r3,#256+32 @ K256 + bic sp,sp,#15 @ align for 128-bit stores + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! 
+ strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_ARCH__>=7 +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] + sub $Ktbl,r3,#sha256_block_data_order-K256 + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! 
+ vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! + vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +.comm OPENSSL_armcap_P,4,4 ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + close STDOUT; # enforce flush diff --git a/app/openssl/crypto/sha/asm/sha256-armv4.s b/app/openssl/crypto/sha/asm/sha256-armv4.s deleted file mode 100644 index 9c20a63c..00000000 --- a/app/openssl/crypto/sha/asm/sha256-armv4.s +++ /dev/null @@ -1,1517 +0,0 @@ -#include "arm_arch.h" - -.text -.code 32 - -.type K256,%object -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.size K256,.-K256 - -.global sha256_block_data_order -.type sha256_block_data_order,%function -sha256_block_data_order: - sub r3,pc,#8 @ sha256_block_data_order - add r2,r1,r2,lsl#6 @ len to point at the end of inp - stmdb sp!,{r0,r1,r2,r4-r11,lr} - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r14,r3,#256 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) 
-.Loop: -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 0 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r8,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r8,ror#11 - eor r2,r9,r10 -#if 0>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 0==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r8,ror#25 @ Sigma1(e) - and r2,r2,r8 - str r3,[sp,#0*4] - add r3,r3,r0 - eor r2,r2,r10 @ Ch(e,f,g) - add r3,r3,r11 - mov r11,r4,ror#2 - add r3,r3,r2 - eor r11,r11,r4,ror#13 - add r3,r3,r12 - eor r11,r11,r4,ror#22 @ Sigma0(a) -#if 0>=15 - ldr r1,[sp,#2*4] @ from BODY_16_xx -#endif - orr r0,r4,r5 - and r2,r4,r5 - and r0,r0,r6 - add r11,r11,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r7,r7,r3 - add r11,r11,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 1 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r7,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r7,ror#11 - eor r2,r8,r9 -#if 1>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 1==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r7,ror#25 @ Sigma1(e) - and r2,r2,r7 - str r3,[sp,#1*4] - add r3,r3,r0 - eor r2,r2,r9 @ Ch(e,f,g) - add r3,r3,r10 - mov r10,r11,ror#2 - add r3,r3,r2 - eor r10,r10,r11,ror#13 - add r3,r3,r12 - eor r10,r10,r11,ror#22 @ Sigma0(a) -#if 1>=15 - ldr r1,[sp,#3*4] @ from BODY_16_xx -#endif - orr r0,r11,r4 - and r2,r11,r4 - and r0,r0,r5 - add r10,r10,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r6,r6,r3 - add r10,r10,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 2 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r6,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor 
r0,r0,r6,ror#11 - eor r2,r7,r8 -#if 2>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 2==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r6,ror#25 @ Sigma1(e) - and r2,r2,r6 - str r3,[sp,#2*4] - add r3,r3,r0 - eor r2,r2,r8 @ Ch(e,f,g) - add r3,r3,r9 - mov r9,r10,ror#2 - add r3,r3,r2 - eor r9,r9,r10,ror#13 - add r3,r3,r12 - eor r9,r9,r10,ror#22 @ Sigma0(a) -#if 2>=15 - ldr r1,[sp,#4*4] @ from BODY_16_xx -#endif - orr r0,r10,r11 - and r2,r10,r11 - and r0,r0,r4 - add r9,r9,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r5,r5,r3 - add r9,r9,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 3 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r5,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r5,ror#11 - eor r2,r6,r7 -#if 3>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 3==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r5,ror#25 @ Sigma1(e) - and r2,r2,r5 - str r3,[sp,#3*4] - add r3,r3,r0 - eor r2,r2,r7 @ Ch(e,f,g) - add r3,r3,r8 - mov r8,r9,ror#2 - add r3,r3,r2 - eor r8,r8,r9,ror#13 - add r3,r3,r12 - eor r8,r8,r9,ror#22 @ Sigma0(a) -#if 3>=15 - ldr r1,[sp,#5*4] @ from BODY_16_xx -#endif - orr r0,r9,r10 - and r2,r9,r10 - and r0,r0,r11 - add r8,r8,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r4,r4,r3 - add r8,r8,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 4 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r4,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r4,ror#11 - eor r2,r5,r6 -#if 4>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 4==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r4,ror#25 @ Sigma1(e) - and r2,r2,r4 - str r3,[sp,#4*4] - add r3,r3,r0 - 
eor r2,r2,r6 @ Ch(e,f,g) - add r3,r3,r7 - mov r7,r8,ror#2 - add r3,r3,r2 - eor r7,r7,r8,ror#13 - add r3,r3,r12 - eor r7,r7,r8,ror#22 @ Sigma0(a) -#if 4>=15 - ldr r1,[sp,#6*4] @ from BODY_16_xx -#endif - orr r0,r8,r9 - and r2,r8,r9 - and r0,r0,r10 - add r7,r7,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r11,r11,r3 - add r7,r7,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 5 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r11,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r11,ror#11 - eor r2,r4,r5 -#if 5>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 5==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r11,ror#25 @ Sigma1(e) - and r2,r2,r11 - str r3,[sp,#5*4] - add r3,r3,r0 - eor r2,r2,r5 @ Ch(e,f,g) - add r3,r3,r6 - mov r6,r7,ror#2 - add r3,r3,r2 - eor r6,r6,r7,ror#13 - add r3,r3,r12 - eor r6,r6,r7,ror#22 @ Sigma0(a) -#if 5>=15 - ldr r1,[sp,#7*4] @ from BODY_16_xx -#endif - orr r0,r7,r8 - and r2,r7,r8 - and r0,r0,r9 - add r6,r6,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r10,r10,r3 - add r6,r6,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 6 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r10,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r10,ror#11 - eor r2,r11,r4 -#if 6>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 6==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r10,ror#25 @ Sigma1(e) - and r2,r2,r10 - str r3,[sp,#6*4] - add r3,r3,r0 - eor r2,r2,r4 @ Ch(e,f,g) - add r3,r3,r5 - mov r5,r6,ror#2 - add r3,r3,r2 - eor r5,r5,r6,ror#13 - add r3,r3,r12 - eor r5,r5,r6,ror#22 @ Sigma0(a) -#if 6>=15 - ldr r1,[sp,#8*4] @ from BODY_16_xx -#endif - orr r0,r6,r7 - and r2,r6,r7 - and r0,r0,r8 - add r5,r5,r3 - orr r0,r0,r2 @ 
Maj(a,b,c) - add r9,r9,r3 - add r5,r5,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 7 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r9,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r9,ror#11 - eor r2,r10,r11 -#if 7>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 7==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r9,ror#25 @ Sigma1(e) - and r2,r2,r9 - str r3,[sp,#7*4] - add r3,r3,r0 - eor r2,r2,r11 @ Ch(e,f,g) - add r3,r3,r4 - mov r4,r5,ror#2 - add r3,r3,r2 - eor r4,r4,r5,ror#13 - add r3,r3,r12 - eor r4,r4,r5,ror#22 @ Sigma0(a) -#if 7>=15 - ldr r1,[sp,#9*4] @ from BODY_16_xx -#endif - orr r0,r5,r6 - and r2,r5,r6 - and r0,r0,r7 - add r4,r4,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r8,r8,r3 - add r4,r4,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 8 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r8,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r8,ror#11 - eor r2,r9,r10 -#if 8>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 8==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r8,ror#25 @ Sigma1(e) - and r2,r2,r8 - str r3,[sp,#8*4] - add r3,r3,r0 - eor r2,r2,r10 @ Ch(e,f,g) - add r3,r3,r11 - mov r11,r4,ror#2 - add r3,r3,r2 - eor r11,r11,r4,ror#13 - add r3,r3,r12 - eor r11,r11,r4,ror#22 @ Sigma0(a) -#if 8>=15 - ldr r1,[sp,#10*4] @ from BODY_16_xx -#endif - orr r0,r4,r5 - and r2,r4,r5 - and r0,r0,r6 - add r11,r11,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r7,r7,r3 - add r11,r11,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 9 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r7,ror#6 - ldr r12,[r14],#4 @ 
*K256++ - eor r0,r0,r7,ror#11 - eor r2,r8,r9 -#if 9>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 9==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r7,ror#25 @ Sigma1(e) - and r2,r2,r7 - str r3,[sp,#9*4] - add r3,r3,r0 - eor r2,r2,r9 @ Ch(e,f,g) - add r3,r3,r10 - mov r10,r11,ror#2 - add r3,r3,r2 - eor r10,r10,r11,ror#13 - add r3,r3,r12 - eor r10,r10,r11,ror#22 @ Sigma0(a) -#if 9>=15 - ldr r1,[sp,#11*4] @ from BODY_16_xx -#endif - orr r0,r11,r4 - and r2,r11,r4 - and r0,r0,r5 - add r10,r10,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r6,r6,r3 - add r10,r10,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 10 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r6,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r6,ror#11 - eor r2,r7,r8 -#if 10>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 10==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r6,ror#25 @ Sigma1(e) - and r2,r2,r6 - str r3,[sp,#10*4] - add r3,r3,r0 - eor r2,r2,r8 @ Ch(e,f,g) - add r3,r3,r9 - mov r9,r10,ror#2 - add r3,r3,r2 - eor r9,r9,r10,ror#13 - add r3,r3,r12 - eor r9,r9,r10,ror#22 @ Sigma0(a) -#if 10>=15 - ldr r1,[sp,#12*4] @ from BODY_16_xx -#endif - orr r0,r10,r11 - and r2,r10,r11 - and r0,r0,r4 - add r9,r9,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r5,r5,r3 - add r9,r9,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 11 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r5,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r5,ror#11 - eor r2,r6,r7 -#if 11>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 11==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r5,ror#25 @ Sigma1(e) - and r2,r2,r5 - 
str r3,[sp,#11*4] - add r3,r3,r0 - eor r2,r2,r7 @ Ch(e,f,g) - add r3,r3,r8 - mov r8,r9,ror#2 - add r3,r3,r2 - eor r8,r8,r9,ror#13 - add r3,r3,r12 - eor r8,r8,r9,ror#22 @ Sigma0(a) -#if 11>=15 - ldr r1,[sp,#13*4] @ from BODY_16_xx -#endif - orr r0,r9,r10 - and r2,r9,r10 - and r0,r0,r11 - add r8,r8,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r4,r4,r3 - add r8,r8,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 12 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r4,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r4,ror#11 - eor r2,r5,r6 -#if 12>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 12==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r4,ror#25 @ Sigma1(e) - and r2,r2,r4 - str r3,[sp,#12*4] - add r3,r3,r0 - eor r2,r2,r6 @ Ch(e,f,g) - add r3,r3,r7 - mov r7,r8,ror#2 - add r3,r3,r2 - eor r7,r7,r8,ror#13 - add r3,r3,r12 - eor r7,r7,r8,ror#22 @ Sigma0(a) -#if 12>=15 - ldr r1,[sp,#14*4] @ from BODY_16_xx -#endif - orr r0,r8,r9 - and r2,r8,r9 - and r0,r0,r10 - add r7,r7,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r11,r11,r3 - add r7,r7,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 13 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r11,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r11,ror#11 - eor r2,r4,r5 -#if 13>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 13==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r11,ror#25 @ Sigma1(e) - and r2,r2,r11 - str r3,[sp,#13*4] - add r3,r3,r0 - eor r2,r2,r5 @ Ch(e,f,g) - add r3,r3,r6 - mov r6,r7,ror#2 - add r3,r3,r2 - eor r6,r6,r7,ror#13 - add r3,r3,r12 - eor r6,r6,r7,ror#22 @ Sigma0(a) -#if 13>=15 - ldr r1,[sp,#15*4] @ from BODY_16_xx -#endif - orr r0,r7,r8 - and r2,r7,r8 - and 
r0,r0,r9 - add r6,r6,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r10,r10,r3 - add r6,r6,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 14 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r10,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r10,ror#11 - eor r2,r11,r4 -#if 14>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 14==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r10,ror#25 @ Sigma1(e) - and r2,r2,r10 - str r3,[sp,#14*4] - add r3,r3,r0 - eor r2,r2,r4 @ Ch(e,f,g) - add r3,r3,r5 - mov r5,r6,ror#2 - add r3,r3,r2 - eor r5,r5,r6,ror#13 - add r3,r3,r12 - eor r5,r5,r6,ror#22 @ Sigma0(a) -#if 14>=15 - ldr r1,[sp,#0*4] @ from BODY_16_xx -#endif - orr r0,r6,r7 - and r2,r6,r7 - and r0,r0,r8 - add r5,r5,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r9,r9,r3 - add r5,r5,r0 -#if __ARM_ARCH__>=7 - ldr r3,[r1],#4 -#else - ldrb r3,[r1,#3] @ 15 - ldrb r12,[r1,#2] - ldrb r2,[r1,#1] - ldrb r0,[r1],#4 - orr r3,r3,r12,lsl#8 - orr r3,r3,r2,lsl#16 - orr r3,r3,r0,lsl#24 -#endif - mov r0,r9,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r9,ror#11 - eor r2,r10,r11 -#if 15>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 15==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r9,ror#25 @ Sigma1(e) - and r2,r2,r9 - str r3,[sp,#15*4] - add r3,r3,r0 - eor r2,r2,r11 @ Ch(e,f,g) - add r3,r3,r4 - mov r4,r5,ror#2 - add r3,r3,r2 - eor r4,r4,r5,ror#13 - add r3,r3,r12 - eor r4,r4,r5,ror#22 @ Sigma0(a) -#if 15>=15 - ldr r1,[sp,#1*4] @ from BODY_16_xx -#endif - orr r0,r5,r6 - and r2,r5,r6 - and r0,r0,r7 - add r4,r4,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r8,r8,r3 - add r4,r4,r0 -.Lrounds_16_xx: - @ ldr r1,[sp,#1*4] @ 16 - ldr r12,[sp,#14*4] - mov r0,r1,ror#7 - ldr r3,[sp,#0*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#9*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov 
r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r8,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r8,ror#11 - eor r2,r9,r10 -#if 16>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 16==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r8,ror#25 @ Sigma1(e) - and r2,r2,r8 - str r3,[sp,#0*4] - add r3,r3,r0 - eor r2,r2,r10 @ Ch(e,f,g) - add r3,r3,r11 - mov r11,r4,ror#2 - add r3,r3,r2 - eor r11,r11,r4,ror#13 - add r3,r3,r12 - eor r11,r11,r4,ror#22 @ Sigma0(a) -#if 16>=15 - ldr r1,[sp,#2*4] @ from BODY_16_xx -#endif - orr r0,r4,r5 - and r2,r4,r5 - and r0,r0,r6 - add r11,r11,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r7,r7,r3 - add r11,r11,r0 - @ ldr r1,[sp,#2*4] @ 17 - ldr r12,[sp,#15*4] - mov r0,r1,ror#7 - ldr r3,[sp,#1*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#10*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r7,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r7,ror#11 - eor r2,r8,r9 -#if 17>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 17==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r7,ror#25 @ Sigma1(e) - and r2,r2,r7 - str r3,[sp,#1*4] - add r3,r3,r0 - eor r2,r2,r9 @ Ch(e,f,g) - add r3,r3,r10 - mov r10,r11,ror#2 - add r3,r3,r2 - eor r10,r10,r11,ror#13 - add r3,r3,r12 - eor r10,r10,r11,ror#22 @ Sigma0(a) -#if 17>=15 - ldr r1,[sp,#3*4] @ from BODY_16_xx -#endif - orr r0,r11,r4 - and r2,r11,r4 - and r0,r0,r5 - add r10,r10,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r6,r6,r3 - add r10,r10,r0 - @ ldr r1,[sp,#3*4] @ 18 - ldr r12,[sp,#0*4] - mov r0,r1,ror#7 - ldr r3,[sp,#2*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#11*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add 
r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r6,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r6,ror#11 - eor r2,r7,r8 -#if 18>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 18==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r6,ror#25 @ Sigma1(e) - and r2,r2,r6 - str r3,[sp,#2*4] - add r3,r3,r0 - eor r2,r2,r8 @ Ch(e,f,g) - add r3,r3,r9 - mov r9,r10,ror#2 - add r3,r3,r2 - eor r9,r9,r10,ror#13 - add r3,r3,r12 - eor r9,r9,r10,ror#22 @ Sigma0(a) -#if 18>=15 - ldr r1,[sp,#4*4] @ from BODY_16_xx -#endif - orr r0,r10,r11 - and r2,r10,r11 - and r0,r0,r4 - add r9,r9,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r5,r5,r3 - add r9,r9,r0 - @ ldr r1,[sp,#4*4] @ 19 - ldr r12,[sp,#1*4] - mov r0,r1,ror#7 - ldr r3,[sp,#3*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#12*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r5,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r5,ror#11 - eor r2,r6,r7 -#if 19>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 19==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r5,ror#25 @ Sigma1(e) - and r2,r2,r5 - str r3,[sp,#3*4] - add r3,r3,r0 - eor r2,r2,r7 @ Ch(e,f,g) - add r3,r3,r8 - mov r8,r9,ror#2 - add r3,r3,r2 - eor r8,r8,r9,ror#13 - add r3,r3,r12 - eor r8,r8,r9,ror#22 @ Sigma0(a) -#if 19>=15 - ldr r1,[sp,#5*4] @ from BODY_16_xx -#endif - orr r0,r9,r10 - and r2,r9,r10 - and r0,r0,r11 - add r8,r8,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r4,r4,r3 - add r8,r8,r0 - @ ldr r1,[sp,#5*4] @ 20 - ldr r12,[sp,#2*4] - mov r0,r1,ror#7 - ldr r3,[sp,#4*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#13*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov 
r0,r4,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r4,ror#11 - eor r2,r5,r6 -#if 20>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 20==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r4,ror#25 @ Sigma1(e) - and r2,r2,r4 - str r3,[sp,#4*4] - add r3,r3,r0 - eor r2,r2,r6 @ Ch(e,f,g) - add r3,r3,r7 - mov r7,r8,ror#2 - add r3,r3,r2 - eor r7,r7,r8,ror#13 - add r3,r3,r12 - eor r7,r7,r8,ror#22 @ Sigma0(a) -#if 20>=15 - ldr r1,[sp,#6*4] @ from BODY_16_xx -#endif - orr r0,r8,r9 - and r2,r8,r9 - and r0,r0,r10 - add r7,r7,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r11,r11,r3 - add r7,r7,r0 - @ ldr r1,[sp,#6*4] @ 21 - ldr r12,[sp,#3*4] - mov r0,r1,ror#7 - ldr r3,[sp,#5*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#14*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r11,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r11,ror#11 - eor r2,r4,r5 -#if 21>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 21==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r11,ror#25 @ Sigma1(e) - and r2,r2,r11 - str r3,[sp,#5*4] - add r3,r3,r0 - eor r2,r2,r5 @ Ch(e,f,g) - add r3,r3,r6 - mov r6,r7,ror#2 - add r3,r3,r2 - eor r6,r6,r7,ror#13 - add r3,r3,r12 - eor r6,r6,r7,ror#22 @ Sigma0(a) -#if 21>=15 - ldr r1,[sp,#7*4] @ from BODY_16_xx -#endif - orr r0,r7,r8 - and r2,r7,r8 - and r0,r0,r9 - add r6,r6,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r10,r10,r3 - add r6,r6,r0 - @ ldr r1,[sp,#7*4] @ 22 - ldr r12,[sp,#4*4] - mov r0,r1,ror#7 - ldr r3,[sp,#6*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#15*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r10,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r10,ror#11 - eor r2,r11,r4 
-#if 22>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 22==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r10,ror#25 @ Sigma1(e) - and r2,r2,r10 - str r3,[sp,#6*4] - add r3,r3,r0 - eor r2,r2,r4 @ Ch(e,f,g) - add r3,r3,r5 - mov r5,r6,ror#2 - add r3,r3,r2 - eor r5,r5,r6,ror#13 - add r3,r3,r12 - eor r5,r5,r6,ror#22 @ Sigma0(a) -#if 22>=15 - ldr r1,[sp,#8*4] @ from BODY_16_xx -#endif - orr r0,r6,r7 - and r2,r6,r7 - and r0,r0,r8 - add r5,r5,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r9,r9,r3 - add r5,r5,r0 - @ ldr r1,[sp,#8*4] @ 23 - ldr r12,[sp,#5*4] - mov r0,r1,ror#7 - ldr r3,[sp,#7*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#0*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r9,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r9,ror#11 - eor r2,r10,r11 -#if 23>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 23==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r9,ror#25 @ Sigma1(e) - and r2,r2,r9 - str r3,[sp,#7*4] - add r3,r3,r0 - eor r2,r2,r11 @ Ch(e,f,g) - add r3,r3,r4 - mov r4,r5,ror#2 - add r3,r3,r2 - eor r4,r4,r5,ror#13 - add r3,r3,r12 - eor r4,r4,r5,ror#22 @ Sigma0(a) -#if 23>=15 - ldr r1,[sp,#9*4] @ from BODY_16_xx -#endif - orr r0,r5,r6 - and r2,r5,r6 - and r0,r0,r7 - add r4,r4,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r8,r8,r3 - add r4,r4,r0 - @ ldr r1,[sp,#9*4] @ 24 - ldr r12,[sp,#6*4] - mov r0,r1,ror#7 - ldr r3,[sp,#8*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#1*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r8,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r8,ror#11 - eor r2,r9,r10 -#if 24>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) 
- rev r3,r3 -#endif -#if 24==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r8,ror#25 @ Sigma1(e) - and r2,r2,r8 - str r3,[sp,#8*4] - add r3,r3,r0 - eor r2,r2,r10 @ Ch(e,f,g) - add r3,r3,r11 - mov r11,r4,ror#2 - add r3,r3,r2 - eor r11,r11,r4,ror#13 - add r3,r3,r12 - eor r11,r11,r4,ror#22 @ Sigma0(a) -#if 24>=15 - ldr r1,[sp,#10*4] @ from BODY_16_xx -#endif - orr r0,r4,r5 - and r2,r4,r5 - and r0,r0,r6 - add r11,r11,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r7,r7,r3 - add r11,r11,r0 - @ ldr r1,[sp,#10*4] @ 25 - ldr r12,[sp,#7*4] - mov r0,r1,ror#7 - ldr r3,[sp,#9*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#2*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r7,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r7,ror#11 - eor r2,r8,r9 -#if 25>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 25==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r7,ror#25 @ Sigma1(e) - and r2,r2,r7 - str r3,[sp,#9*4] - add r3,r3,r0 - eor r2,r2,r9 @ Ch(e,f,g) - add r3,r3,r10 - mov r10,r11,ror#2 - add r3,r3,r2 - eor r10,r10,r11,ror#13 - add r3,r3,r12 - eor r10,r10,r11,ror#22 @ Sigma0(a) -#if 25>=15 - ldr r1,[sp,#11*4] @ from BODY_16_xx -#endif - orr r0,r11,r4 - and r2,r11,r4 - and r0,r0,r5 - add r10,r10,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r6,r6,r3 - add r10,r10,r0 - @ ldr r1,[sp,#11*4] @ 26 - ldr r12,[sp,#8*4] - mov r0,r1,ror#7 - ldr r3,[sp,#10*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#3*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r6,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r6,ror#11 - eor r2,r7,r8 -#if 26>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 26==15 - str r1,[sp,#17*4] @ leave room 
for r1 -#endif - eor r0,r0,r6,ror#25 @ Sigma1(e) - and r2,r2,r6 - str r3,[sp,#10*4] - add r3,r3,r0 - eor r2,r2,r8 @ Ch(e,f,g) - add r3,r3,r9 - mov r9,r10,ror#2 - add r3,r3,r2 - eor r9,r9,r10,ror#13 - add r3,r3,r12 - eor r9,r9,r10,ror#22 @ Sigma0(a) -#if 26>=15 - ldr r1,[sp,#12*4] @ from BODY_16_xx -#endif - orr r0,r10,r11 - and r2,r10,r11 - and r0,r0,r4 - add r9,r9,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r5,r5,r3 - add r9,r9,r0 - @ ldr r1,[sp,#12*4] @ 27 - ldr r12,[sp,#9*4] - mov r0,r1,ror#7 - ldr r3,[sp,#11*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#4*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r5,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r5,ror#11 - eor r2,r6,r7 -#if 27>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 27==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r5,ror#25 @ Sigma1(e) - and r2,r2,r5 - str r3,[sp,#11*4] - add r3,r3,r0 - eor r2,r2,r7 @ Ch(e,f,g) - add r3,r3,r8 - mov r8,r9,ror#2 - add r3,r3,r2 - eor r8,r8,r9,ror#13 - add r3,r3,r12 - eor r8,r8,r9,ror#22 @ Sigma0(a) -#if 27>=15 - ldr r1,[sp,#13*4] @ from BODY_16_xx -#endif - orr r0,r9,r10 - and r2,r9,r10 - and r0,r0,r11 - add r8,r8,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r4,r4,r3 - add r8,r8,r0 - @ ldr r1,[sp,#13*4] @ 28 - ldr r12,[sp,#10*4] - mov r0,r1,ror#7 - ldr r3,[sp,#12*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#5*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r4,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r4,ror#11 - eor r2,r5,r6 -#if 28>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 28==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r4,ror#25 @ Sigma1(e) - and r2,r2,r4 - str 
r3,[sp,#12*4] - add r3,r3,r0 - eor r2,r2,r6 @ Ch(e,f,g) - add r3,r3,r7 - mov r7,r8,ror#2 - add r3,r3,r2 - eor r7,r7,r8,ror#13 - add r3,r3,r12 - eor r7,r7,r8,ror#22 @ Sigma0(a) -#if 28>=15 - ldr r1,[sp,#14*4] @ from BODY_16_xx -#endif - orr r0,r8,r9 - and r2,r8,r9 - and r0,r0,r10 - add r7,r7,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r11,r11,r3 - add r7,r7,r0 - @ ldr r1,[sp,#14*4] @ 29 - ldr r12,[sp,#11*4] - mov r0,r1,ror#7 - ldr r3,[sp,#13*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#6*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r11,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r11,ror#11 - eor r2,r4,r5 -#if 29>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 29==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r11,ror#25 @ Sigma1(e) - and r2,r2,r11 - str r3,[sp,#13*4] - add r3,r3,r0 - eor r2,r2,r5 @ Ch(e,f,g) - add r3,r3,r6 - mov r6,r7,ror#2 - add r3,r3,r2 - eor r6,r6,r7,ror#13 - add r3,r3,r12 - eor r6,r6,r7,ror#22 @ Sigma0(a) -#if 29>=15 - ldr r1,[sp,#15*4] @ from BODY_16_xx -#endif - orr r0,r7,r8 - and r2,r7,r8 - and r0,r0,r9 - add r6,r6,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r10,r10,r3 - add r6,r6,r0 - @ ldr r1,[sp,#15*4] @ 30 - ldr r12,[sp,#12*4] - mov r0,r1,ror#7 - ldr r3,[sp,#14*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#7*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r10,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r10,ror#11 - eor r2,r11,r4 -#if 30>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 30==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r10,ror#25 @ Sigma1(e) - and r2,r2,r10 - str r3,[sp,#14*4] - add r3,r3,r0 - eor r2,r2,r4 @ Ch(e,f,g) - add r3,r3,r5 
- mov r5,r6,ror#2 - add r3,r3,r2 - eor r5,r5,r6,ror#13 - add r3,r3,r12 - eor r5,r5,r6,ror#22 @ Sigma0(a) -#if 30>=15 - ldr r1,[sp,#0*4] @ from BODY_16_xx -#endif - orr r0,r6,r7 - and r2,r6,r7 - and r0,r0,r8 - add r5,r5,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r9,r9,r3 - add r5,r5,r0 - @ ldr r1,[sp,#0*4] @ 31 - ldr r12,[sp,#13*4] - mov r0,r1,ror#7 - ldr r3,[sp,#15*4] - eor r0,r0,r1,ror#18 - ldr r2,[sp,#8*4] - eor r0,r0,r1,lsr#3 @ sigma0(X[i+1]) - mov r1,r12,ror#17 - add r3,r3,r0 - eor r1,r1,r12,ror#19 - add r3,r3,r2 - eor r1,r1,r12,lsr#10 @ sigma1(X[i+14]) - @ add r3,r3,r1 - mov r0,r9,ror#6 - ldr r12,[r14],#4 @ *K256++ - eor r0,r0,r9,ror#11 - eor r2,r10,r11 -#if 31>=16 - add r3,r3,r1 @ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) - rev r3,r3 -#endif -#if 31==15 - str r1,[sp,#17*4] @ leave room for r1 -#endif - eor r0,r0,r9,ror#25 @ Sigma1(e) - and r2,r2,r9 - str r3,[sp,#15*4] - add r3,r3,r0 - eor r2,r2,r11 @ Ch(e,f,g) - add r3,r3,r4 - mov r4,r5,ror#2 - add r3,r3,r2 - eor r4,r4,r5,ror#13 - add r3,r3,r12 - eor r4,r4,r5,ror#22 @ Sigma0(a) -#if 31>=15 - ldr r1,[sp,#1*4] @ from BODY_16_xx -#endif - orr r0,r5,r6 - and r2,r5,r6 - and r0,r0,r7 - add r4,r4,r3 - orr r0,r0,r2 @ Maj(a,b,c) - add r8,r8,r3 - add r4,r4,r0 - and r12,r12,#0xff - cmp r12,#0xf2 - bne .Lrounds_16_xx - - ldr r3,[sp,#16*4] @ pull ctx - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne .Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r11,pc} -#else - ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif 
-.size sha256_block_data_order,.-sha256_block_data_order -.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align 2 diff --git a/app/openssl/crypto/sha/asm/sha256-armv8.S b/app/openssl/crypto/sha/asm/sha256-armv8.S new file mode 100644 index 00000000..bd43b1fe --- /dev/null +++ b/app/openssl/crypto/sha/asm/sha256-armv8.S @@ -0,0 +1,1141 @@ +#include "arm_arch.h" + +.text + +.globl sha256_block_data_order +.type sha256_block_data_order,%function +.align 6 +sha256_block_data_order: + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adr x30,K256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + 
orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and 
w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add 
w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add 
w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor 
w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // 
Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 
// sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + 
ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + 
add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add 
w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic 
w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor 
w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round 
+ add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha256_block_data_order,.-sha256_block_data_order + +.align 6 +.type K256,%object +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size K256,.-K256 +.align 3 +.LOPENSSL_armcap_P: + .quad OPENSSL_armcap_P-. +.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,K256 + +.Loop_hw: + ld1 {v4.16b-v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + 
.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add 
v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha512-armv4.S b/app/openssl/crypto/sha/asm/sha512-armv4.S index 046c909a..fd462771 120000..100644 --- a/app/openssl/crypto/sha/asm/sha512-armv4.S +++ b/app/openssl/crypto/sha/asm/sha512-armv4.S @@ -1 +1,1783 @@ -sha512-armv4.s
\ No newline at end of file +#include "arm_arch.h" +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +.code 32 +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) 
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: + sub r3,pc,#8 @ sha512_block_data_order + add r2,r1,r2,lsl#7 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + stmdb sp!,{r4-r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ 
Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ 
Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str 
r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr 
r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +#if __ARM_ARCH__>=7 +.fpu neon + +.align 4 +.LNEON: + dmb @ errata #451034 on early Cortex A8 + vstmdb sp!,{d8-d15} @ ABI specification says so + sub r3,r3,#672 @ K512 + vldmia r0,{d16-d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d0 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d1 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d2 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d3 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d4 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d5 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d6 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d7 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d8 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d9 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d10 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d11 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d12 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d13 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d14 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d15 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d0 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d1 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d2 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d3 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d4 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d5 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d6 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d7 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d8 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d9 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d10 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d11 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d12 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d13 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d14 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! 
@ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d15 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + bne .L16_79_neon + + vldmia r0,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16-d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + vldmia sp!,{d8-d15} @ epilogue + bx lr @ .word 0xe12fff1e +#endif +.size sha512_block_data_order,.-sha512_block_data_order +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha512-armv4.pl b/app/openssl/crypto/sha/asm/sha512-armv4.pl index 7faf37b1..71aa9356 100644 --- a/app/openssl/crypto/sha/asm/sha512-armv4.pl +++ b/app/openssl/crypto/sha/asm/sha512-armv4.pl @@ -565,7 +565,7 @@ $code.=<<___; bne .Loop_neon vldmia sp!,{d8-d15} @ epilogue - bx lr + ret @ bx lr #endif ___ } @@ -578,5 +578,6 @@ ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; print $code; close STDOUT; # enforce flush diff --git a/app/openssl/crypto/sha/asm/sha512-armv4.s b/app/openssl/crypto/sha/asm/sha512-armv4.s deleted file mode 100644 index 57301922..00000000 --- 
a/app/openssl/crypto/sha/asm/sha512-armv4.s +++ /dev/null @@ -1,1783 +0,0 @@ -#include "arm_arch.h" -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -.code 32 -.type K512,%object -.align 5 -K512: -WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) -WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) -WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) -WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) -WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) -WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) -WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) -WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) -WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) -WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) -WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) -WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) -WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) -WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) -WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) -WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) -WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) -WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) -WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) -WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) -WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) -WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) -WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) -WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) -WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) -WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) -WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) -WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) -WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) -WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) 
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) -WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) -WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) -WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) -WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) -WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) -WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) -WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) -WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) -WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) -.size K512,.-K512 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order -.skip 32-4 - -.global sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: - sub r3,pc,#8 @ sha512_block_data_order - add r2,r1,r2,lsl#7 @ len to point at the end of inp -#if __ARM_ARCH__>=7 - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 - bne .LNEON -#endif - stmdb sp!,{r4-r12,lr} - sub r14,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -.Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - -.L00_15: -#if __ARM_ARCH__<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef 
__ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and 
r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq .L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -.L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 - adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - 
str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq .L16_79 - bic r14,r14,#1 - - ldr r3,[sp,#8+0] - ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, 
[r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, [r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne .Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -#if __ARM_ARCH__>=7 -.fpu neon - -.align 4 -.LNEON: - dmb @ errata #451034 on early Cortex A8 - vstmdb sp!,{d8-d15} @ ABI specification says so - sub r3,r3,#672 @ K512 - vldmia r0,{d16-d23} @ load context -.Loop_neon: - vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d0 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d1 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d2 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d3 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d4 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d5 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d6 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d7 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d8 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d9 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d10 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d11 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d12 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d13 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d14 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d15 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - mov r12,#4 -.L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d0 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 17<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d1 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 q12,q0,#19 - vshr.u64 q13,q0,#61 - vshr.u64 q15,q0,#6 - vsli.64 q12,q0,#45 - vext.8 q14,q1,q2,#8 @ X[i+1] - vsli.64 q13,q0,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q1,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q5,q6,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q1,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q1,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 18<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d2 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 19 -#if 19<16 - vld1.64 {d3},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 19<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d3 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 q12,q1,#19 - vshr.u64 q13,q1,#61 - vshr.u64 q15,q1,#6 - vsli.64 q12,q1,#45 - vext.8 q14,q2,q3,#8 @ X[i+1] - vsli.64 q13,q1,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q2,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q6,q7,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q2,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q2,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 20<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d4 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 21 -#if 21<16 - vld1.64 {d5},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 21<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d5 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 q12,q2,#19 - vshr.u64 q13,q2,#61 - vshr.u64 q15,q2,#6 - vsli.64 q12,q2,#45 - vext.8 q14,q3,q4,#8 @ X[i+1] - vsli.64 q13,q2,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q3,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q7,q0,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q3,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q3,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 22<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d6 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 23 -#if 23<16 - vld1.64 {d7},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 23<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d7 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - vshr.u64 q12,q3,#19 - vshr.u64 q13,q3,#61 - vshr.u64 q15,q3,#6 - vsli.64 q12,q3,#45 - vext.8 q14,q4,q5,#8 @ X[i+1] - vsli.64 q13,q3,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q4,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q0,q1,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q4,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q4,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 24<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d8 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 25 -#if 25<16 - vld1.64 {d9},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 25<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d9 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 q12,q4,#19 - vshr.u64 q13,q4,#61 - vshr.u64 q15,q4,#6 - vsli.64 q12,q4,#45 - vext.8 q14,q5,q6,#8 @ X[i+1] - vsli.64 q13,q4,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q5,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q1,q2,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q5,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q5,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 26<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d10 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 27 -#if 27<16 - vld1.64 {d11},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 27<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d11 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 q12,q5,#19 - vshr.u64 q13,q5,#61 - vshr.u64 q15,q5,#6 - vsli.64 q12,q5,#45 - vext.8 q14,q6,q7,#8 @ X[i+1] - vsli.64 q13,q5,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q6,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q2,q3,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q6,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q6,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 28<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d12 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 29 -#if 29<16 - vld1.64 {d13},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 29<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d13 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 q12,q6,#19 - vshr.u64 q13,q6,#61 - vshr.u64 q15,q6,#6 - vsli.64 q12,q6,#45 - vext.8 q14,q7,q0,#8 @ X[i+1] - vsli.64 q13,q6,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q7,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q3,q4,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q7,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q7,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 30<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d14 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 31 -#if 31<16 - vld1.64 {d15},[r1]! 
@ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 31<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d15 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - bne .L16_79_neon - - vldmia r0,{d24-d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia r0,{d16-d23} @ save context - teq r1,r2 - sub r3,#640 @ rewind K512 - bne .Loop_neon - - vldmia sp!,{d8-d15} @ epilogue - .word 0xe12fff1e -#endif -.size sha512_block_data_order,.-sha512_block_data_order -.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" -.align 2 -.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha512-armv8.S b/app/openssl/crypto/sha/asm/sha512-armv8.S new file mode 100644 index 00000000..6b0d1940 --- /dev/null +++ b/app/openssl/crypto/sha/asm/sha512-armv8.S @@ -0,0 +1,1021 @@ +#include "arm_arch.h" + +.text + +.globl sha512_block_data_order +.type sha512_block_data_order,%function +.align 6 +sha512_block_data_order: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adr x30,K512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // 
h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 
// h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x11,x11 
// 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 
in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // 
d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror 
x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + 
eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr 
x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // 
h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // 
sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add 
x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor 
x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp 
x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha512_block_data_order,.-sha512_block_data_order + +.align 6 +.type K512,%object +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // 
terminator +.size K512,.-K512 +.align 3 +.LOPENSSL_armcap_P: + .quad OPENSSL_armcap_P-. +.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/app/openssl/crypto/sha/asm/sha512-armv8.pl b/app/openssl/crypto/sha/asm/sha512-armv8.pl new file mode 100644 index 00000000..6935ed65 --- /dev/null +++ b/app/openssl/crypto/sha/asm/sha512-armv8.pl @@ -0,0 +1,414 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A5x n/a n/a n/a +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10%, but at the cost of 20% loss on Cortex-A5x. 
+ +$flavour=shift; +$output=shift; +open STDOUT,">$output"; + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __ARMEB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
+$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst 
w16,#ARMV8_SHA256 + b.ne .Lv8_entry +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,K$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type K$BITS,%object +K$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 
0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size K$BITS,.-K$BITS +.align 3 +.LOPENSSL_armcap_P: + .quad OPENSSL_armcap_P-. +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,K256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr 
$abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +___ +} + +$code.=<<___; +.comm OPENSSL_armcap_P,4,4 +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/app/openssl/crypto/srp/srp_vfy.c b/app/openssl/crypto/srp/srp_vfy.c index 4a3d13ed..fdca19ff 100644 --- a/app/openssl/crypto/srp/srp_vfy.c +++ b/app/openssl/crypto/srp/srp_vfy.c @@ -93,6 +93,9 @@ static int t_fromb64(unsigned char *a, const char *src) else a[i] = loc - b64table; ++i; } + /* if nothing valid to process we have a zero length response */ + if (i == 0) + return 0; size = i; i = size - 1; j = size; diff --git a/app/openssl/crypto/x509v3/v3_purp.c b/app/openssl/crypto/x509v3/v3_purp.c index ad688657..f59bfc18 100644 --- a/app/openssl/crypto/x509v3/v3_purp.c +++ b/app/openssl/crypto/x509v3/v3_purp.c @@ -389,8 +389,8 @@ static void x509v3_cache_extensions(X509 *x) /* Handle proxy certificates */ if((pci=X509_get_ext_d2i(x, NID_proxyCertInfo, NULL, NULL))) { if (x->ex_flags & EXFLAG_CA - || X509_get_ext_by_NID(x, NID_subject_alt_name, 0) >= 0 - || X509_get_ext_by_NID(x, NID_issuer_alt_name, 0) >= 0) { + || X509_get_ext_by_NID(x, NID_subject_alt_name, -1) >= 0 + || X509_get_ext_by_NID(x, NID_issuer_alt_name, -1) >= 0) { x->ex_flags |= 
EXFLAG_INVALID; } if (pci->pcPathLengthConstraint) { @@ -670,7 +670,7 @@ static int check_purpose_timestamp_sign(const X509_PURPOSE *xp, const X509 *x, return 0; /* Extended Key Usage MUST be critical */ - i_ext = X509_get_ext_by_NID((X509 *) x, NID_ext_key_usage, 0); + i_ext = X509_get_ext_by_NID((X509 *) x, NID_ext_key_usage, -1); if (i_ext >= 0) { X509_EXTENSION *ext = X509_get_ext((X509 *) x, i_ext); diff --git a/app/openssl/crypto/x86cpuid.S b/app/openssl/crypto/x86cpuid.S index 87a46d4b..10be221c 100644 --- a/app/openssl/crypto/x86cpuid.S +++ b/app/openssl/crypto/x86cpuid.S @@ -341,6 +341,8 @@ OPENSSL_ia32_rdrand: cmovel %ecx,%eax ret .size OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin +.hidden OPENSSL_cpuid_setup +.hidden OPENSSL_ia32cap_P .comm OPENSSL_ia32cap_P,8,4 .section .init call OPENSSL_cpuid_setup diff --git a/app/openssl/crypto/x86cpuid.pl b/app/openssl/crypto/x86cpuid.pl index b270b443..0212a5b6 100644 --- a/app/openssl/crypto/x86cpuid.pl +++ b/app/openssl/crypto/x86cpuid.pl @@ -355,4 +355,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &initseg("OPENSSL_cpuid_setup"); +&hidden("OPENSSL_cpuid_setup"); +&hidden("OPENSSL_ia32cap_P"); + &asm_finish(); |