From e1bf276c709a1f26fd99540081128bff879650ce Mon Sep 17 00:00:00 2001 From: Arne Schwabe Date: Wed, 2 Jul 2014 14:38:56 +0200 Subject: Rename .s files to .S to fix build under linux (closes issue #261) --HG-- rename : main/openssl/crypto/aes/asm/aes-armv4.s => main/openssl/crypto/aes/asm/aes-armv4.S rename : main/openssl/crypto/bn/asm/armv4-mont.s => main/openssl/crypto/bn/asm/armv4-mont.S rename : main/openssl/crypto/bn/asm/mips3.s => main/openssl/crypto/bn/asm/mips3.S rename : main/openssl/crypto/bn/asm/pa-risc2.s => main/openssl/crypto/bn/asm/pa-risc2.S rename : main/openssl/crypto/bn/asm/pa-risc2W.s => main/openssl/crypto/bn/asm/pa-risc2W.S rename : main/openssl/crypto/sha/asm/sha1-armv4-large.s => main/openssl/crypto/sha/asm/sha1-armv4-large.S rename : main/openssl/crypto/sha/asm/sha256-armv4.s => main/openssl/crypto/sha/asm/sha256-armv4.S rename : main/openssl/crypto/sha/asm/sha512-armv4.s => main/openssl/crypto/sha/asm/sha512-armv4.S --- main/openssl/crypto/aes/asm/aes-armv4.S | 1177 +++++++++++ main/openssl/crypto/aes/asm/aes-armv4.s | 1177 ----------- main/openssl/crypto/bn/asm/armv4-mont.S | 579 +++++ main/openssl/crypto/bn/asm/armv4-mont.s | 579 ----- main/openssl/crypto/bn/asm/mips3.S | 2201 +++++++++++++++++++ main/openssl/crypto/bn/asm/mips3.s | 2201 ------------------- main/openssl/crypto/bn/asm/pa-risc2.S | 1618 ++++++++++++++ main/openssl/crypto/bn/asm/pa-risc2.s | 1618 -------------- main/openssl/crypto/bn/asm/pa-risc2W.S | 1605 ++++++++++++++ main/openssl/crypto/bn/asm/pa-risc2W.s | 1605 -------------- main/openssl/crypto/sha/asm/sha1-armv4-large.S | 1450 +++++++++++++ main/openssl/crypto/sha/asm/sha1-armv4-large.s | 1450 ------------- main/openssl/crypto/sha/asm/sha256-armv4.S | 2690 ++++++++++++++++++++++++ main/openssl/crypto/sha/asm/sha256-armv4.s | 2690 ------------------------ main/openssl/crypto/sha/asm/sha512-armv4.S | 1783 ++++++++++++++++ main/openssl/crypto/sha/asm/sha512-armv4.s | 1783 ---------------- 16 files changed, 13103 insertions(+), 13103 deletions(-) create mode 100644 main/openssl/crypto/aes/asm/aes-armv4.S delete mode 100644 main/openssl/crypto/aes/asm/aes-armv4.s create mode 100644 main/openssl/crypto/bn/asm/armv4-mont.S delete mode 100644 main/openssl/crypto/bn/asm/armv4-mont.s create mode 100644 main/openssl/crypto/bn/asm/mips3.S delete mode 100644 main/openssl/crypto/bn/asm/mips3.s create mode 100644 main/openssl/crypto/bn/asm/pa-risc2.S delete mode 100644 main/openssl/crypto/bn/asm/pa-risc2.s create mode 100644 main/openssl/crypto/bn/asm/pa-risc2W.S delete mode 100644 main/openssl/crypto/bn/asm/pa-risc2W.s create mode 100644 main/openssl/crypto/sha/asm/sha1-armv4-large.S delete mode 100644 main/openssl/crypto/sha/asm/sha1-armv4-large.s create mode 100644 main/openssl/crypto/sha/asm/sha256-armv4.S delete mode 100644 main/openssl/crypto/sha/asm/sha256-armv4.s create mode 100644 main/openssl/crypto/sha/asm/sha512-armv4.S delete mode 100644 main/openssl/crypto/sha/asm/sha512-armv4.s diff --git a/main/openssl/crypto/aes/asm/aes-armv4.S b/main/openssl/crypto/aes/asm/aes-armv4.S new file mode 100644 index 00000000..333a5227 --- /dev/null +++ b/main/openssl/crypto/aes/asm/aes-armv4.S @@ -0,0 +1,1177 @@ + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. 
+@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type AES_Te,%object +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d +.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 
0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 +.size AES_Te,.-AES_Te + +@ void AES_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_encrypt +.type AES_encrypt,%function +.align 5 +AES_encrypt: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub r10,r3,#AES_encrypt-AES_Te @ Te +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... 
+ ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_encrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_encrypt,.-AES_encrypt + +.type _armv4_AES_encrypt,%function +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0 + and r8,lr,r0,lsr#8 + and r9,lr,r0,lsr#16 + mov r0,r0,lsr#24 +.Lenc_loop: + ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] + and r8,lr,r1 + ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] + ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] + ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] + eor r0,r0,r7,ror#8 + ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,ror#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r6,r9,ror#8 + and r9,lr,r2 + ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] + eor r1,r1,r4,ror#24 + ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,ror#8 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r6,r9,ror#16 + and r9,lr,r3,lsr#16 @ i2 + ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] + eor r2,r2,r5,ror#16 + ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] + eor r0,r0,r7,ror#24 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] + eor r2,r2,r9,ror#8 + ldr r4,[r11,#-12] + eor r3,r3,r6,ror#8 + + ldr r5,[r11,#-8] + eor r0,r0,r7 + ldr r6,[r11,#-4] + and r7,lr,r0 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0,lsr#16 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Lenc_loop + + add r10,r10,#2 + + ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] + and r8,lr,r1 + ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] + and r9,lr,r1,lsr#8 + ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] + mov r1,r1,lsr#24 + + ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] + ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] + ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] + eor r0,r7,r0,lsl#8 + ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,lsl#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] + eor r1,r4,r1,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] + mov r2,r2,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] + eor r0,r7,r0,lsl#8 + ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r3,lsr#16 @ i2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] + eor r2,r5,r2,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] + mov r3,r3,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] + eor r0,r7,r0,lsl#8 + ldr r7,[r11,#0] + ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r2,r9,lsl#16 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#2 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_encrypt,.-_armv4_AES_encrypt + +.global private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,%function +.align 5 +private_AES_set_encrypt_key: +_armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif + teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq .Labrt + teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq .Labrt + + teq r1,#128 + beq .Lok + teq r1,#192 + beq .Lok 
+ teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif + movne r0,#-1 + bne .Labrt + +.Lok: stmdb sp!,{r4-r12,lr} + sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 + + mov r12,r0 @ inp + mov lr,r1 @ bits + mov r11,r2 @ key + +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + str r0,[r11],#16 + orr r3,r3,r5,lsl#16 + str r1,[r11,#-12] + orr r3,r3,r6,lsl#24 + str r2,[r11,#-8] + str r3,[r11,#-4] +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r11],#16 + str r1,[r11,#-12] + str r2,[r11,#-8] + str r3,[r11,#-4] +#endif + + teq lr,#128 + bne .Lnot128 + mov r12,#10 + str r12,[r11,#240-16] + add r6,r10,#256 @ rcon + mov lr,#255 + +.L128_loop: + and r5,lr,r3,lsr#24 + and r7,lr,r3,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r3 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r5,r5,r4 + eor r0,r0,r5 @ rk[4]=rk[0]^... + eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] + str r0,[r11],#16 + eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] + str r1,[r11,#-12] + eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] + str r2,[r11,#-8] + subs r12,r12,#1 + str r3,[r11,#-4] + bne .L128_loop + sub r2,r11,#176 + b .Ldone + +.Lnot128: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#19] + ldrb r4,[r12,#18] + ldrb r5,[r12,#17] + ldrb r6,[r12,#16] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#23] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#22] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#21] + ldrb r6,[r12,#20] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#16] + ldr r9,[r12,#20] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + teq lr,#192 + bne .Lnot192 + mov r12,#12 + str r12,[r11,#240-24] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#8 + +.L192_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[6]=rk[0]^... 
+ eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] + str r0,[r11],#24 + eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] + str r1,[r11,#-20] + eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] + str r2,[r11,#-16] + subs r12,r12,#1 + str r3,[r11,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#216 + beq .Ldone + + ldr r7,[r11,#-32] + ldr r8,[r11,#-28] + eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] + eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] + str r7,[r11,#-8] + str r9,[r11,#-4] + b .L192_loop + +.Lnot192: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#27] + ldrb r4,[r12,#26] + ldrb r5,[r12,#25] + ldrb r6,[r12,#24] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#31] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#30] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#29] + ldrb r6,[r12,#28] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#24] + ldr r9,[r12,#28] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + mov r12,#14 + str r12,[r11,#240-32] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#7 + +.L256_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[8]=rk[0]^... + eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] + str r0,[r11],#32 + eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] + str r1,[r11,#-28] + eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] + str r2,[r11,#-24] + subs r12,r12,#1 + str r3,[r11,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#256 + beq .Ldone + + and r5,lr,r3 + and r7,lr,r3,lsr#8 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#16 + ldrb r7,[r10,r7] + and r9,lr,r3,lsr#24 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#8 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r11,#-48] + orr r5,r5,r9,lsl#24 + + ldr r7,[r11,#-44] + ldr r8,[r11,#-40] + eor r4,r4,r5 @ rk[12]=rk[4]^... + ldr r9,[r11,#-36] + eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] + str r4,[r11,#-16] + eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] + str r7,[r11,#-12] + eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] + str r8,[r11,#-8] + str r9,[r11,#-4] + b .L256_loop + +.align 2 +.Ldone: mov r0,#0 + ldmia sp!,{r4-r12,lr} +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key + +.global private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,%function +.align 5 +private_AES_set_decrypt_key: + str lr,[sp,#-4]! 
@ push lr + bl _armv4_AES_set_encrypt_key + teq r0,#0 + ldr lr,[sp],#4 @ pop lr + bne .Labrt + + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr r12,[r0,#240] + mov r7,r0 @ input + add r8,r0,r12,lsl#4 + mov r11,r1 @ ouput + add r10,r1,r12,lsl#4 + str r12,[r1,#240] + +.Linv: ldr r0,[r7],#16 + ldr r1,[r7,#-12] + ldr r2,[r7,#-8] + ldr r3,[r7,#-4] + ldr r4,[r8],#-16 + ldr r5,[r8,#16+4] + ldr r6,[r8,#16+8] + ldr r9,[r8,#16+12] + str r0,[r10],#-16 + str r1,[r10,#16+4] + str r2,[r10,#16+8] + str r3,[r10,#16+12] + str r4,[r11],#16 + str r5,[r11,#-12] + str r6,[r11,#-8] + str r9,[r11,#-4] + teq r7,r8 + bne .Linv + + ldr r0,[r7] + ldr r1,[r7,#4] + ldr r2,[r7,#8] + ldr r3,[r7,#12] + str r0,[r11] + str r1,[r11,#4] + str r2,[r11,#8] + str r3,[r11,#12] + sub r11,r11,r12,lsl#3 + ldr r0,[r11,#16]! @ prefetch tp1 + mov r7,#0x80 + mov r8,#0x1b + orr r7,r7,#0x8000 + orr r8,r8,#0x1b00 + orr r7,r7,r7,lsl#16 + orr r8,r8,r8,lsl#16 + sub r12,r12,#1 + mvn r9,r7 + mov r12,r12,lsl#2 @ (rounds-1)*4 + +.Lmix: and r4,r0,r7 + and r1,r0,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r1,r4,r1,lsl#1 @ tp2 + + and r4,r1,r7 + and r2,r1,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r2,r4,r2,lsl#1 @ tp4 + + and r4,r2,r7 + and r3,r2,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r3,r4,r3,lsl#1 @ tp8 + + eor r4,r1,r2 + eor r5,r0,r3 @ tp9 + eor r4,r4,r3 @ tpe + eor r4,r4,r1,ror#24 + eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor r4,r4,r2,ror#16 + eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) + + ldr r0,[r11,#4] @ prefetch tp1 + str r4,[r11],#4 + subs r12,r12,#1 + bne .Lmix + + mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key + +.type AES_Td,%object +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 
0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 
0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.size AES_Td,.-AES_Td + +@ void AES_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_decrypt +.type AES_decrypt,%function +.align 5 +AES_decrypt: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub r10,r3,#AES_decrypt-AES_Td @ Td +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_decrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_decrypt,.-AES_decrypt + +.type _armv4_AES_decrypt,%function +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0,lsr#16 + and r8,lr,r0,lsr#8 + and r9,lr,r0 + mov r0,r0,lsr#24 +.Ldec_loop: + ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] + and r7,lr,r1 @ i0 + ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] + and r8,lr,r1,lsr#16 + ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] + ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] + ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] + eor r0,r0,r7,ror#24 + ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,ror#8 + and r8,lr,r2 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r2,lsr#16 + ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] + eor r1,r1,r4,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] + and r7,lr,r3,lsr#16 @ i0 + eor r1,r1,r8,ror#24 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r3 @ i2 + ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] + eor r2,r2,r5,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] + eor r0,r0,r7,ror#8 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] + eor r2,r2,r9,ror#24 + + ldr r4,[r11,#-12] + eor r0,r0,r7 + ldr r5,[r11,#-8] + eor r3,r3,r6,ror#8 + ldr r6,[r11,#-4] + and r7,lr,r0,lsr#16 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Ldec_loop + + add r10,r10,#1024 + + ldr r5,[r10,#0] @ prefetch Td4 + ldr r6,[r10,#32] + ldr r4,[r10,#64] + ldr r5,[r10,#96] + ldr r6,[r10,#128] + ldr r4,[r10,#160] + ldr r5,[r10,#192] + ldr r6,[r10,#224] + + ldrb r0,[r10,r0] @ Td4[s0>>24] + ldrb r4,[r10,r7] @ Td4[s0>>16] + and r7,lr,r1 @ i0 + ldrb r5,[r10,r8] @ Td4[s0>>8] + and r8,lr,r1,lsr#16 + ldrb r6,[r10,r9] @ Td4[s0>>0] + and r9,lr,r1,lsr#8 + + add r1,r10,r1,lsr#24 + ldrb r7,[r10,r7] @ Td4[s1>>0] + ldrb r1,[r1] @ Td4[s1>>24] + ldrb r8,[r10,r8] @ Td4[s1>>16] + eor r0,r7,r0,lsl#24 + ldrb r9,[r10,r9] @ Td4[s1>>8] + eor r1,r4,r1,lsl#8 + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,lsl#8 + and r8,lr,r2 @ i1 + ldrb r7,[r10,r7] @ Td4[s2>>8] + eor r6,r6,r9,lsl#8 + ldrb r8,[r10,r8] @ Td4[s2>>0] + and r9,lr,r2,lsr#16 + + add r2,r10,r2,lsr#24 + ldrb r2,[r2] @ Td4[s2>>24] + eor r0,r0,r7,lsl#8 + ldrb r9,[r10,r9] @ Td4[s2>>16] + eor r1,r8,r1,lsl#16 + and r7,lr,r3,lsr#16 @ i0 + eor r2,r5,r2,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + ldrb r7,[r10,r7] @ Td4[s3>>16] + eor r6,r6,r9,lsl#16 + ldrb r8,[r10,r8] @ Td4[s3>>8] + and r9,lr,r3 @ i2 + + add r3,r10,r3,lsr#24 + ldrb r9,[r10,r9] @ Td4[s3>>0] + ldrb r3,[r3] @ Td4[s3>>24] + eor r0,r0,r7,lsl#16 + ldr r7,[r11,#0] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r9,r2,lsl#8 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#1024 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_decrypt,.-_armv4_AES_decrypt +.asciz "AES for ARMv4, CRYPTOGAMS by " +.align 2 diff --git a/main/openssl/crypto/aes/asm/aes-armv4.s b/main/openssl/crypto/aes/asm/aes-armv4.s deleted file mode 100644 index 333a5227..00000000 --- a/main/openssl/crypto/aes/asm/aes-armv4.s +++ /dev/null @@ -1,1177 +0,0 @@ - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. 
The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ ==================================================================== - -@ AES for ARMv4 - -@ January 2007. -@ -@ Code uses single 1K S-box and is >2 times faster than code generated -@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which -@ allows to merge logical or arithmetic operation with shift or rotate -@ in one instruction and emit combined result every cycle. The module -@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit -@ key [on single-issue Xscale PXA250 core]. - -@ May 2007. -@ -@ AES_set_[en|de]crypt_key is added. - -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 12% improvement on -@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 16% -@ improvement on Cortex A8 core and ~21.5 cycles per byte. - -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -#endif - -.text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else -.code 32 -# endif -#endif - -.type AES_Te,%object -.align 5 -AES_Te: -.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d -.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 -.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d -.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a -.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 -.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b -.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea -.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b -.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a -.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f -.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 -.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f -.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e -.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 -.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d -.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f -.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e -.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb -.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce -.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 -.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c -.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed -.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b -.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a -.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 -.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 -.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 -.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 -.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a -.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 -.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 -.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d -.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f -.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 -.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 -.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 -.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f -.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 -.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c -.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 -.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e 
-.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 -.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 -.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b -.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 -.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 -.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 -.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 -.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 -.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 -.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 -.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 -.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa -.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 -.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 -.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 -.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 -.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 -.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 -.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a -.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 -.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 -.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 -.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a -@ Te4[256] -.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 -.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 -.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 -.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 -.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc -.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 -.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a -.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 -.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 -.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 -.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b -.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf -.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 -.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 -.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 -.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 -.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 -.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 -.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 -.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb -.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c -.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 -.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 -.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 -.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 -.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a -.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e -.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e -.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 -.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf -.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 -.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 -@ rcon[] -.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 -.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 -.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 -.size AES_Te,.-AES_Te - -@ void AES_encrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.global AES_encrypt -.type AES_encrypt,%function -.align 5 -AES_encrypt: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ AES_encrypt -#else - adr r3,AES_encrypt -#endif - stmdb sp!,{r1,r4-r12,lr} - mov r12,r0 @ inp - mov r11,r2 - sub r10,r3,#AES_encrypt-AES_Te @ Te -#if __ARM_ARCH__<7 - ldrb 
r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - orr r3,r3,r5,lsl#16 - orr r3,r3,r6,lsl#24 -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#endif - bl _armv4_AES_encrypt - - ldr r12,[sp],#4 @ pop out -#if __ARM_ARCH__>=7 -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r12,#0] - str r1,[r12,#4] - str r2,[r12,#8] - str r3,[r12,#12] -#else - mov r4,r0,lsr#24 @ write output in endian-neutral - mov r5,r0,lsr#16 @ manner... - mov r6,r0,lsr#8 - strb r4,[r12,#0] - strb r5,[r12,#1] - mov r4,r1,lsr#24 - strb r6,[r12,#2] - mov r5,r1,lsr#16 - strb r0,[r12,#3] - mov r6,r1,lsr#8 - strb r4,[r12,#4] - strb r5,[r12,#5] - mov r4,r2,lsr#24 - strb r6,[r12,#6] - mov r5,r2,lsr#16 - strb r1,[r12,#7] - mov r6,r2,lsr#8 - strb r4,[r12,#8] - strb r5,[r12,#9] - mov r4,r3,lsr#24 - strb r6,[r12,#10] - mov r5,r3,lsr#16 - strb r2,[r12,#11] - mov r6,r3,lsr#8 - strb r4,[r12,#12] - strb r5,[r12,#13] - strb r6,[r12,#14] - strb r3,[r12,#15] -#endif -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size AES_encrypt,.-AES_encrypt - -.type _armv4_AES_encrypt,%function -.align 2 -_armv4_AES_encrypt: - str lr,[sp,#-4]! 
@ push lr - ldmia r11!,{r4-r7} - eor r0,r0,r4 - ldr r12,[r11,#240-16] - eor r1,r1,r5 - eor r2,r2,r6 - eor r3,r3,r7 - sub r12,r12,#1 - mov lr,#255 - - and r7,lr,r0 - and r8,lr,r0,lsr#8 - and r9,lr,r0,lsr#16 - mov r0,r0,lsr#24 -.Lenc_loop: - ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] - and r7,lr,r1,lsr#16 @ i0 - ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] - and r8,lr,r1 - ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] - and r9,lr,r1,lsr#8 - ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] - mov r1,r1,lsr#24 - - ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] - ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] - ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] - eor r0,r0,r7,ror#8 - ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r5,r8,ror#8 - and r8,lr,r2,lsr#16 @ i1 - eor r6,r6,r9,ror#8 - and r9,lr,r2 - ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] - eor r1,r1,r4,ror#24 - ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] - mov r2,r2,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] - eor r0,r0,r7,ror#16 - ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] - and r7,lr,r3 @ i0 - eor r1,r1,r8,ror#8 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r6,r9,ror#16 - and r9,lr,r3,lsr#16 @ i2 - ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] - eor r2,r2,r5,ror#16 - ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] - mov r3,r3,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] - eor r0,r0,r7,ror#24 - ldr r7,[r11],#16 - eor r1,r1,r8,ror#16 - ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] - eor r2,r2,r9,ror#8 - ldr r4,[r11,#-12] - eor r3,r3,r6,ror#8 - - ldr r5,[r11,#-8] - eor r0,r0,r7 - ldr r6,[r11,#-4] - and r7,lr,r0 - eor r1,r1,r4 - and r8,lr,r0,lsr#8 - eor r2,r2,r5 - and r9,lr,r0,lsr#16 - eor r3,r3,r6 - mov r0,r0,lsr#24 - - subs r12,r12,#1 - bne .Lenc_loop - - add r10,r10,#2 - - ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] - and r7,lr,r1,lsr#16 @ i0 - ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] - and r8,lr,r1 - ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] - and r9,lr,r1,lsr#8 - ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] - mov r1,r1,lsr#24 - - ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] - ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] - ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] - eor r0,r7,r0,lsl#8 - ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r8,r5,lsl#8 - and r8,lr,r2,lsr#16 @ i1 - eor r6,r9,r6,lsl#8 - and r9,lr,r2 - ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] - eor r1,r4,r1,lsl#24 - ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] - mov r2,r2,lsr#24 - - ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] - eor r0,r7,r0,lsl#8 - ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] - and r7,lr,r3 @ i0 - eor r1,r1,r8,lsl#16 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r9,r6,lsl#8 - and r9,lr,r3,lsr#16 @ i2 - ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] - eor r2,r5,r2,lsl#24 - ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] - mov r3,r3,lsr#24 - - ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] - eor r0,r7,r0,lsl#8 - ldr r7,[r11,#0] - ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] - eor r1,r1,r8,lsl#8 - ldr r4,[r11,#4] - eor r2,r2,r9,lsl#16 - ldr r5,[r11,#8] - eor r3,r6,r3,lsl#24 - ldr r6,[r11,#12] - - eor r0,r0,r7 - eor r1,r1,r4 - eor r2,r2,r5 - eor r3,r3,r6 - - sub r10,r10,#2 - ldr pc,[sp],#4 @ pop and return -.size _armv4_AES_encrypt,.-_armv4_AES_encrypt - -.global private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,%function -.align 5 -private_AES_set_encrypt_key: -_armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ AES_set_encrypt_key -#else - adr r3,private_AES_set_encrypt_key -#endif - teq r0,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif - moveq r0,#-1 - beq .Labrt - teq r2,#0 -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif - moveq r0,#-1 - beq .Labrt - - teq r1,#128 - beq .Lok - teq r1,#192 - beq .Lok 
- teq r1,#256 -#if __ARM_ARCH__>=7 - itt ne @ Thumb2 thing, sanity check in ARM -#endif - movne r0,#-1 - bne .Labrt - -.Lok: stmdb sp!,{r4-r12,lr} - sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 - - mov r12,r0 @ inp - mov lr,r1 @ bits - mov r11,r2 @ key - -#if __ARM_ARCH__<7 - ldrb r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - str r0,[r11],#16 - orr r3,r3,r5,lsl#16 - str r1,[r11,#-12] - orr r3,r3,r6,lsl#24 - str r2,[r11,#-8] - str r3,[r11,#-4] -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r11],#16 - str r1,[r11,#-12] - str r2,[r11,#-8] - str r3,[r11,#-4] -#endif - - teq lr,#128 - bne .Lnot128 - mov r12,#10 - str r12,[r11,#240-16] - add r6,r10,#256 @ rcon - mov lr,#255 - -.L128_loop: - and r5,lr,r3,lsr#24 - and r7,lr,r3,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r3,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r3 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r5,r5,r4 - eor r0,r0,r5 @ rk[4]=rk[0]^... - eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] - str r0,[r11],#16 - eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] - str r1,[r11,#-12] - eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] - str r2,[r11,#-8] - subs r12,r12,#1 - str r3,[r11,#-4] - bne .L128_loop - sub r2,r11,#176 - b .Ldone - -.Lnot128: -#if __ARM_ARCH__<7 - ldrb r8,[r12,#19] - ldrb r4,[r12,#18] - ldrb r5,[r12,#17] - ldrb r6,[r12,#16] - orr r8,r8,r4,lsl#8 - ldrb r9,[r12,#23] - orr r8,r8,r5,lsl#16 - ldrb r4,[r12,#22] - orr r8,r8,r6,lsl#24 - ldrb r5,[r12,#21] - ldrb r6,[r12,#20] - orr r9,r9,r4,lsl#8 - orr r9,r9,r5,lsl#16 - str r8,[r11],#8 - orr r9,r9,r6,lsl#24 - str r9,[r11,#-4] -#else - ldr r8,[r12,#16] - ldr r9,[r12,#20] -#ifdef __ARMEL__ - rev r8,r8 - rev r9,r9 -#endif - str r8,[r11],#8 - str r9,[r11,#-4] -#endif - - teq lr,#192 - bne .Lnot192 - mov r12,#12 - str r12,[r11,#240-24] - add r6,r10,#256 @ rcon - mov lr,#255 - mov r12,#8 - -.L192_loop: - and r5,lr,r9,lsr#24 - and r7,lr,r9,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r9,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r9 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r9,r5,r4 - eor r0,r0,r9 @ rk[6]=rk[0]^... 
- eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] - str r0,[r11],#24 - eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] - str r1,[r11,#-20] - eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] - str r2,[r11,#-16] - subs r12,r12,#1 - str r3,[r11,#-12] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif - subeq r2,r11,#216 - beq .Ldone - - ldr r7,[r11,#-32] - ldr r8,[r11,#-28] - eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] - eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] - str r7,[r11,#-8] - str r9,[r11,#-4] - b .L192_loop - -.Lnot192: -#if __ARM_ARCH__<7 - ldrb r8,[r12,#27] - ldrb r4,[r12,#26] - ldrb r5,[r12,#25] - ldrb r6,[r12,#24] - orr r8,r8,r4,lsl#8 - ldrb r9,[r12,#31] - orr r8,r8,r5,lsl#16 - ldrb r4,[r12,#30] - orr r8,r8,r6,lsl#24 - ldrb r5,[r12,#29] - ldrb r6,[r12,#28] - orr r9,r9,r4,lsl#8 - orr r9,r9,r5,lsl#16 - str r8,[r11],#8 - orr r9,r9,r6,lsl#24 - str r9,[r11,#-4] -#else - ldr r8,[r12,#24] - ldr r9,[r12,#28] -#ifdef __ARMEL__ - rev r8,r8 - rev r9,r9 -#endif - str r8,[r11],#8 - str r9,[r11,#-4] -#endif - - mov r12,#14 - str r12,[r11,#240-32] - add r6,r10,#256 @ rcon - mov lr,#255 - mov r12,#7 - -.L256_loop: - and r5,lr,r9,lsr#24 - and r7,lr,r9,lsr#16 - ldrb r5,[r10,r5] - and r8,lr,r9,lsr#8 - ldrb r7,[r10,r7] - and r9,lr,r9 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#24 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r6],#4 @ rcon[i++] - orr r5,r5,r9,lsl#8 - eor r9,r5,r4 - eor r0,r0,r9 @ rk[8]=rk[0]^... - eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] - str r0,[r11],#32 - eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] - str r1,[r11,#-28] - eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] - str r2,[r11,#-24] - subs r12,r12,#1 - str r3,[r11,#-20] -#if __ARM_ARCH__>=7 - itt eq @ Thumb2 thing, sanity check in ARM -#endif - subeq r2,r11,#256 - beq .Ldone - - and r5,lr,r3 - and r7,lr,r3,lsr#8 - ldrb r5,[r10,r5] - and r8,lr,r3,lsr#16 - ldrb r7,[r10,r7] - and r9,lr,r3,lsr#24 - ldrb r8,[r10,r8] - orr r5,r5,r7,lsl#8 - ldrb r9,[r10,r9] - orr r5,r5,r8,lsl#16 - ldr r4,[r11,#-48] - orr r5,r5,r9,lsl#24 - - ldr r7,[r11,#-44] - ldr r8,[r11,#-40] - eor r4,r4,r5 @ rk[12]=rk[4]^... - ldr r9,[r11,#-36] - eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] - str r4,[r11,#-16] - eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] - str r7,[r11,#-12] - eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] - str r8,[r11,#-8] - str r9,[r11,#-4] - b .L256_loop - -.align 2 -.Ldone: mov r0,#0 - ldmia sp!,{r4-r12,lr} -.Labrt: -#if __ARM_ARCH__>=5 - bx lr @ .word 0xe12fff1e -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key - -.global private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,%function -.align 5 -private_AES_set_decrypt_key: - str lr,[sp,#-4]! 
@ push lr - bl _armv4_AES_set_encrypt_key - teq r0,#0 - ldr lr,[sp],#4 @ pop lr - bne .Labrt - - mov r0,r2 @ AES_set_encrypt_key preserves r2, - mov r1,r2 @ which is AES_KEY *key - b _armv4_AES_set_enc2dec_key -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - -@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) -.global AES_set_enc2dec_key -.type AES_set_enc2dec_key,%function -.align 5 -AES_set_enc2dec_key: -_armv4_AES_set_enc2dec_key: - stmdb sp!,{r4-r12,lr} - - ldr r12,[r0,#240] - mov r7,r0 @ input - add r8,r0,r12,lsl#4 - mov r11,r1 @ ouput - add r10,r1,r12,lsl#4 - str r12,[r1,#240] - -.Linv: ldr r0,[r7],#16 - ldr r1,[r7,#-12] - ldr r2,[r7,#-8] - ldr r3,[r7,#-4] - ldr r4,[r8],#-16 - ldr r5,[r8,#16+4] - ldr r6,[r8,#16+8] - ldr r9,[r8,#16+12] - str r0,[r10],#-16 - str r1,[r10,#16+4] - str r2,[r10,#16+8] - str r3,[r10,#16+12] - str r4,[r11],#16 - str r5,[r11,#-12] - str r6,[r11,#-8] - str r9,[r11,#-4] - teq r7,r8 - bne .Linv - - ldr r0,[r7] - ldr r1,[r7,#4] - ldr r2,[r7,#8] - ldr r3,[r7,#12] - str r0,[r11] - str r1,[r11,#4] - str r2,[r11,#8] - str r3,[r11,#12] - sub r11,r11,r12,lsl#3 - ldr r0,[r11,#16]! @ prefetch tp1 - mov r7,#0x80 - mov r8,#0x1b - orr r7,r7,#0x8000 - orr r8,r8,#0x1b00 - orr r7,r7,r7,lsl#16 - orr r8,r8,r8,lsl#16 - sub r12,r12,#1 - mvn r9,r7 - mov r12,r12,lsl#2 @ (rounds-1)*4 - -.Lmix: and r4,r0,r7 - and r1,r0,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r1,r4,r1,lsl#1 @ tp2 - - and r4,r1,r7 - and r2,r1,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r2,r4,r2,lsl#1 @ tp4 - - and r4,r2,r7 - and r3,r2,r9 - sub r4,r4,r4,lsr#7 - and r4,r4,r8 - eor r3,r4,r3,lsl#1 @ tp8 - - eor r4,r1,r2 - eor r5,r0,r3 @ tp9 - eor r4,r4,r3 @ tpe - eor r4,r4,r1,ror#24 - eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) - eor r4,r4,r2,ror#16 - eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) - eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) - - ldr r0,[r11,#4] @ prefetch tp1 - str r4,[r11],#4 - subs r12,r12,#1 - bne .Lmix - - mov r0,#0 -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size AES_set_enc2dec_key,.-AES_set_enc2dec_key - -.type AES_Td,%object -.align 5 -AES_Td: -.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 -.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 -.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 -.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f -.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 -.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 -.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da -.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 -.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd -.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 -.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 -.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 -.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 -.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a -.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 -.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c -.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 -.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a -.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 -.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 -.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 -.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff -.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 -.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 
0x79c8eedb -.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 -.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e -.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 -.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a -.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e -.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 -.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d -.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 -.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd -.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 -.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 -.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 -.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d -.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 -.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 -.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef -.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 -.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 -.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 -.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 -.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 -.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b -.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 -.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 -.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 -.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 -.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 -.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f -.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df -.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f -.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e -.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 -.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 -.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c -.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf -.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 -.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f -.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 -.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 -.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 -@ Td4[256] -.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 -.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb -.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 -.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb -.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d -.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e -.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 -.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 -.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 -.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 -.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda -.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 -.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a -.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 -.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 -.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b -.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea -.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 -.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 -.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e -.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 -.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b -.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 -.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 -.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 -.byte 0xb1, 0x12, 
0x10, 0x59, 0x27, 0x80, 0xec, 0x5f -.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d -.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef -.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 -.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 -.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 -.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d -.size AES_Td,.-AES_Td - -@ void AES_decrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.global AES_decrypt -.type AES_decrypt,%function -.align 5 -AES_decrypt: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ AES_decrypt -#else - adr r3,AES_decrypt -#endif - stmdb sp!,{r1,r4-r12,lr} - mov r12,r0 @ inp - mov r11,r2 - sub r10,r3,#AES_decrypt-AES_Td @ Td -#if __ARM_ARCH__<7 - ldrb r0,[r12,#3] @ load input data in endian-neutral - ldrb r4,[r12,#2] @ manner... - ldrb r5,[r12,#1] - ldrb r6,[r12,#0] - orr r0,r0,r4,lsl#8 - ldrb r1,[r12,#7] - orr r0,r0,r5,lsl#16 - ldrb r4,[r12,#6] - orr r0,r0,r6,lsl#24 - ldrb r5,[r12,#5] - ldrb r6,[r12,#4] - orr r1,r1,r4,lsl#8 - ldrb r2,[r12,#11] - orr r1,r1,r5,lsl#16 - ldrb r4,[r12,#10] - orr r1,r1,r6,lsl#24 - ldrb r5,[r12,#9] - ldrb r6,[r12,#8] - orr r2,r2,r4,lsl#8 - ldrb r3,[r12,#15] - orr r2,r2,r5,lsl#16 - ldrb r4,[r12,#14] - orr r2,r2,r6,lsl#24 - ldrb r5,[r12,#13] - ldrb r6,[r12,#12] - orr r3,r3,r4,lsl#8 - orr r3,r3,r5,lsl#16 - orr r3,r3,r6,lsl#24 -#else - ldr r0,[r12,#0] - ldr r1,[r12,#4] - ldr r2,[r12,#8] - ldr r3,[r12,#12] -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#endif - bl _armv4_AES_decrypt - - ldr r12,[sp],#4 @ pop out -#if __ARM_ARCH__>=7 -#ifdef __ARMEL__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - str r0,[r12,#0] - str r1,[r12,#4] - str r2,[r12,#8] - str r3,[r12,#12] -#else - mov r4,r0,lsr#24 @ write output in endian-neutral - mov r5,r0,lsr#16 @ manner... - mov r6,r0,lsr#8 - strb r4,[r12,#0] - strb r5,[r12,#1] - mov r4,r1,lsr#24 - strb r6,[r12,#2] - mov r5,r1,lsr#16 - strb r0,[r12,#3] - mov r6,r1,lsr#8 - strb r4,[r12,#4] - strb r5,[r12,#5] - mov r4,r2,lsr#24 - strb r6,[r12,#6] - mov r5,r2,lsr#16 - strb r1,[r12,#7] - mov r6,r2,lsr#8 - strb r4,[r12,#8] - strb r5,[r12,#9] - mov r4,r3,lsr#24 - strb r6,[r12,#10] - mov r5,r3,lsr#16 - strb r2,[r12,#11] - mov r6,r3,lsr#8 - strb r4,[r12,#12] - strb r5,[r12,#13] - strb r6,[r12,#14] - strb r3,[r12,#15] -#endif -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size AES_decrypt,.-AES_decrypt - -.type _armv4_AES_decrypt,%function -.align 2 -_armv4_AES_decrypt: - str lr,[sp,#-4]! 
@ push lr - ldmia r11!,{r4-r7} - eor r0,r0,r4 - ldr r12,[r11,#240-16] - eor r1,r1,r5 - eor r2,r2,r6 - eor r3,r3,r7 - sub r12,r12,#1 - mov lr,#255 - - and r7,lr,r0,lsr#16 - and r8,lr,r0,lsr#8 - and r9,lr,r0 - mov r0,r0,lsr#24 -.Ldec_loop: - ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] - and r7,lr,r1 @ i0 - ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] - and r8,lr,r1,lsr#16 - ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] - and r9,lr,r1,lsr#8 - ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] - mov r1,r1,lsr#24 - - ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] - ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] - ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] - eor r0,r0,r7,ror#24 - ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] - and r7,lr,r2,lsr#8 @ i0 - eor r5,r8,r5,ror#8 - and r8,lr,r2 @ i1 - eor r6,r9,r6,ror#8 - and r9,lr,r2,lsr#16 - ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] - eor r1,r1,r4,ror#8 - ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] - mov r2,r2,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] - eor r0,r0,r7,ror#16 - ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] - and r7,lr,r3,lsr#16 @ i0 - eor r1,r1,r8,ror#24 - and r8,lr,r3,lsr#8 @ i1 - eor r6,r9,r6,ror#8 - and r9,lr,r3 @ i2 - ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] - eor r2,r2,r5,ror#8 - ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8] - mov r3,r3,lsr#24 - - ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] - eor r0,r0,r7,ror#8 - ldr r7,[r11],#16 - eor r1,r1,r8,ror#16 - ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] - eor r2,r2,r9,ror#24 - - ldr r4,[r11,#-12] - eor r0,r0,r7 - ldr r5,[r11,#-8] - eor r3,r3,r6,ror#8 - ldr r6,[r11,#-4] - and r7,lr,r0,lsr#16 - eor r1,r1,r4 - and r8,lr,r0,lsr#8 - eor r2,r2,r5 - and r9,lr,r0 - eor r3,r3,r6 - mov r0,r0,lsr#24 - - subs r12,r12,#1 - bne .Ldec_loop - - add r10,r10,#1024 - - ldr r5,[r10,#0] @ prefetch Td4 - ldr r6,[r10,#32] - ldr r4,[r10,#64] - ldr r5,[r10,#96] - ldr r6,[r10,#128] - ldr r4,[r10,#160] - ldr r5,[r10,#192] - ldr r6,[r10,#224] - - ldrb r0,[r10,r0] @ Td4[s0>>24] - ldrb r4,[r10,r7] @ Td4[s0>>16] - and r7,lr,r1 @ i0 - ldrb r5,[r10,r8] @ Td4[s0>>8] - and r8,lr,r1,lsr#16 - ldrb r6,[r10,r9] @ Td4[s0>>0] - and r9,lr,r1,lsr#8 - - add r1,r10,r1,lsr#24 - ldrb r7,[r10,r7] @ Td4[s1>>0] - ldrb r1,[r1] @ Td4[s1>>24] - ldrb r8,[r10,r8] @ Td4[s1>>16] - eor r0,r7,r0,lsl#24 - ldrb r9,[r10,r9] @ Td4[s1>>8] - eor r1,r4,r1,lsl#8 - and r7,lr,r2,lsr#8 @ i0 - eor r5,r5,r8,lsl#8 - and r8,lr,r2 @ i1 - ldrb r7,[r10,r7] @ Td4[s2>>8] - eor r6,r6,r9,lsl#8 - ldrb r8,[r10,r8] @ Td4[s2>>0] - and r9,lr,r2,lsr#16 - - add r2,r10,r2,lsr#24 - ldrb r2,[r2] @ Td4[s2>>24] - eor r0,r0,r7,lsl#8 - ldrb r9,[r10,r9] @ Td4[s2>>16] - eor r1,r8,r1,lsl#16 - and r7,lr,r3,lsr#16 @ i0 - eor r2,r5,r2,lsl#16 - and r8,lr,r3,lsr#8 @ i1 - ldrb r7,[r10,r7] @ Td4[s3>>16] - eor r6,r6,r9,lsl#16 - ldrb r8,[r10,r8] @ Td4[s3>>8] - and r9,lr,r3 @ i2 - - add r3,r10,r3,lsr#24 - ldrb r9,[r10,r9] @ Td4[s3>>0] - ldrb r3,[r3] @ Td4[s3>>24] - eor r0,r0,r7,lsl#16 - ldr r7,[r11,#0] - eor r1,r1,r8,lsl#8 - ldr r4,[r11,#4] - eor r2,r9,r2,lsl#8 - ldr r5,[r11,#8] - eor r3,r6,r3,lsl#24 - ldr r6,[r11,#12] - - eor r0,r0,r7 - eor r1,r1,r4 - eor r2,r2,r5 - eor r3,r3,r6 - - sub r10,r10,#1024 - ldr pc,[sp],#4 @ pop and return -.size _armv4_AES_decrypt,.-_armv4_AES_decrypt -.asciz "AES for ARMv4, CRYPTOGAMS by " -.align 2 diff --git a/main/openssl/crypto/bn/asm/armv4-mont.S b/main/openssl/crypto/bn/asm/armv4-mont.S new file mode 100644 index 00000000..fecae15e --- /dev/null +++ b/main/openssl/crypto/bn/asm/armv4-mont.S @@ -0,0 +1,579 @@ +#include "arm_arch.h" + +.text +.code 32 + +#if __ARM_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-bn_mul_mont +#endif + +.global bn_mul_mont +.type bn_mul_mont,%function + 
+.align 5 +bn_mul_mont: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block +#if __ARM_ARCH__>=7 + tst ip,#7 + bne .Lialu + adr r0,bn_mul_mont + ldr r2,.LOPENSSL_armcap + ldr r0,[r0,r2] + tst r0,#1 @ NEON available? + ldmia sp, {r0,r2} + beq .Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +.Lialu: +#endif + cmp ip,#2 + mov r0,ip @ load num + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,sp @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! @ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,sp @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + + and r1,r4,r14 + bic r3,r2,r14 + orr r1,r1,r3 @ ap=borrow?tp:rp + +.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh + str sp,[r4],#4 @ zap tp + str r7,[r2],#4 + cmp r4,r0 + bne .Lcopy + + add sp,r0,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont,.-bn_mul_mont +#if __ARM_ARCH__>=7 +.fpu neon + +.type 
bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + + sub r7,sp,#16 + vld1.32 {d28[0]}, [r2,:32]! + sub r7,r7,r5,lsl#4 + vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + veor d8,d8,d8 + subs r8,r5,#8 + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d10,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d10,d10,d12 + veor d8,d8,d8 + vmul.u32 d29,d10,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4-d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + bne .LNEON_1st + + @ special case for num=8, everything is in register bank... + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d10,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d10,d10,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d10,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#16 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail2 + +.align 4 +.LNEON_1st: + vmlal.u32 q6,d29,d4[0] + vld1.32 {d0-d3}, [r1]! + vmlal.u32 q7,d29,d4[1] + subs r8,r8,#8 + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vld1.32 {d4-d5}, [r3]! + vmlal.u32 q11,d29,d6[1] + vst1.64 {q6-q7}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q8-q9}, [r7,:256]! + + vmull.u32 q6,d28,d0[0] + vld1.32 {d6-d7}, [r3]! + vmull.u32 q7,d28,d0[1] + vst1.64 {q10-q11}, [r7,:256]! + vmull.u32 q8,d28,d1[0] + vmull.u32 q9,d28,d1[1] + vst1.64 {q12-q13}, [r7,:256]! + + vmull.u32 q10,d28,d2[0] + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vmull.u32 q13,d28,d3[1] + + bne .LNEON_1st + + vmlal.u32 q6,d29,d4[0] + add r6,sp,#16 + vmlal.u32 q7,d29,d4[1] + sub r1,r1,r5,lsl#2 @ rewind r1 + vmlal.u32 q8,d29,d5[0] + vld1.64 {q5}, [sp,:128] + vmlal.u32 q9,d29,d5[1] + sub r9,r5,#1 + + vmlal.u32 q10,d29,d6[0] + vst1.64 {q6-q7}, [r7,:256]! + vmlal.u32 q11,d29,d6[1] + vshr.u64 d10,d10,#16 + vld1.64 {q6}, [r6, :128]! + vmlal.u32 q12,d29,d7[0] + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q13,d29,d7[1] + + vst1.64 {q10-q11}, [r7,:256]! + vadd.u64 d10,d10,d11 + veor q4,q4,q4 + vst1.64 {q12-q13}, [r7,:256]! + vld1.64 {q7-q8}, [r6, :256]! 
+ vst1.64 {q4}, [r7,:128] + vshr.u64 d10,d10,#16 + + b .LNEON_outer + +.align 4 +.LNEON_outer: + vld1.32 {d28[0]}, [r2,:32]! + sub r3,r3,r5,lsl#2 @ rewind r3 + vld1.32 {d0-d3}, [r1]! + veor d8,d8,d8 + mov r7,sp + vzip.16 d28,d8 + sub r8,r5,#8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vld1.64 {q9-q10},[r6,:256]! + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vld1.64 {q11-q12},[r6,:256]! + vmlal.u32 q9,d28,d1[1] + + vshl.i64 d10,d13,#16 + veor d8,d8,d8 + vadd.u64 d10,d10,d12 + vld1.64 {q13},[r6,:128]! + vmul.u32 d29,d10,d30 + + vmlal.u32 q10,d28,d2[0] + vld1.32 {d4-d7}, [r3]! + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + +.LNEON_inner: + vmlal.u32 q6,d29,d4[0] + vld1.32 {d0-d3}, [r1]! + vmlal.u32 q7,d29,d4[1] + subs r8,r8,#8 + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vst1.64 {q6-q7}, [r7,:256]! + + vmlal.u32 q10,d29,d6[0] + vld1.64 {q6}, [r6, :128]! + vmlal.u32 q11,d29,d6[1] + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vld1.64 {q7-q8}, [r6, :256]! + vmlal.u32 q13,d29,d7[1] + vst1.64 {q10-q11}, [r7,:256]! + + vmlal.u32 q6,d28,d0[0] + vld1.64 {q9-q10}, [r6, :256]! + vmlal.u32 q7,d28,d0[1] + vst1.64 {q12-q13}, [r7,:256]! + vmlal.u32 q8,d28,d1[0] + vld1.64 {q11-q12}, [r6, :256]! + vmlal.u32 q9,d28,d1[1] + vld1.32 {d4-d7}, [r3]! + + vmlal.u32 q10,d28,d2[0] + vld1.64 {q13}, [r6, :128]! + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + + bne .LNEON_inner + + vmlal.u32 q6,d29,d4[0] + add r6,sp,#16 + vmlal.u32 q7,d29,d4[1] + sub r1,r1,r5,lsl#2 @ rewind r1 + vmlal.u32 q8,d29,d5[0] + vld1.64 {q5}, [sp,:128] + vmlal.u32 q9,d29,d5[1] + subs r9,r9,#1 + + vmlal.u32 q10,d29,d6[0] + vst1.64 {q6-q7}, [r7,:256]! + vmlal.u32 q11,d29,d6[1] + vld1.64 {q6}, [r6, :128]! + vshr.u64 d10,d10,#16 + vst1.64 {q8-q9}, [r7,:256]! + vmlal.u32 q12,d29,d7[0] + vld1.64 {q7-q8}, [r6, :256]! + vmlal.u32 q13,d29,d7[1] + + vst1.64 {q10-q11}, [r7,:256]! + vadd.u64 d10,d10,d11 + vst1.64 {q12-q13}, [r7,:256]! + vshr.u64 d10,d10,#16 + + bne .LNEON_outer + + mov r7,sp + mov r8,r5 + +.LNEON_tail: + vadd.u64 d12,d12,d10 + vld1.64 {q9-q10}, [r6, :256]! + vshr.u64 d10,d12,#16 + vadd.u64 d13,d13,d10 + vld1.64 {q11-q12}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q13}, [r6, :128]! + vzip.16 d12,d13 + +.LNEON_tail2: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vld1.64 {q6}, [r6, :128]! + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vld1.64 {q7-q8}, [r6, :256]! + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! 
+ + bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4-r7} + ldmia r3!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8-r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + veor q0,q0,q0 + sub r11,r2,sp @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r11,r7 + ldmia r1, {r4-r7} + stmia r0!, {r8-r11} + sub r1,r1,#16 + ldmia r0, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [r1,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [r3,:256]! @ wipe + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8-r11} + bne .LNEON_copy_n_zap + + sub sp,ip,#96 + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + bx lr @ .word 0xe12fff1e +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +#if __ARM_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/main/openssl/crypto/bn/asm/armv4-mont.s b/main/openssl/crypto/bn/asm/armv4-mont.s deleted file mode 100644 index fecae15e..00000000 --- a/main/openssl/crypto/bn/asm/armv4-mont.s +++ /dev/null @@ -1,579 +0,0 @@ -#include "arm_arch.h" - -.text -.code 32 - -#if __ARM_ARCH__>=7 -.align 5 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-bn_mul_mont -#endif - -.global bn_mul_mont -.type bn_mul_mont,%function - -.align 5 -bn_mul_mont: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block -#if __ARM_ARCH__>=7 - tst ip,#7 - bne .Lialu - adr r0,bn_mul_mont - ldr r2,.LOPENSSL_armcap - ldr r0,[r0,r2] - tst r0,#1 @ NEON available? - ldmia sp, {r0,r2} - beq .Lialu - add sp,sp,#8 - b bn_mul8x_mont_neon -.align 4 -.Lialu: -#endif - cmp ip,#2 - mov r0,ip @ load num - movlt r0,#0 - addlt sp,sp,#2*4 - blt .Labrt - - stmdb sp!,{r4-r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -.L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - -.Louter: - sub r7,r0,sp @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! 
@ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -.Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 - bne .Louter - - ldr r2,[r0,#12*4] @ pull rp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,sp @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -.Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne .Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - - and r1,r4,r14 - bic r3,r2,r14 - orr r1,r1,r3 @ ap=borrow?tp:rp - -.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh - str sp,[r4],#4 @ zap tp - str r7,[r2],#4 - cmp r4,r0 - bne .Lcopy - - add sp,r0,#4 @ skip over tp[num+1] - ldmia sp!,{r4-r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labrt: -#if __ARM_ARCH__>=5 - bx lr @ .word 0xe12fff1e -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size bn_mul_mont,.-bn_mul_mont -#if __ARM_ARCH__>=7 -.fpu neon - -.type bn_mul8x_mont_neon,%function -.align 5 -bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4-r11} - vstmdb sp!,{d8-d15} @ ABI specification says so - ldmia ip,{r4-r5} @ load rest of parameter block - - sub r7,sp,#16 - vld1.32 {d28[0]}, [r2,:32]! - sub r7,r7,r5,lsl#4 - vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - veor d8,d8,d8 - subs r8,r5,#8 - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d10,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d10,d10,d12 - veor d8,d8,d8 - vmul.u32 d29,d10,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4-d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - bne .LNEON_1st - - @ special case for num=8, everything is in register bank... - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b .LNEON_outer8 - -.align 4 -.LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! 
- veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d10,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d10,d10,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d10,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne .LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#16 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b .LNEON_tail2 - -.align 4 -.LNEON_1st: - vmlal.u32 q6,d29,d4[0] - vld1.32 {d0-d3}, [r1]! - vmlal.u32 q7,d29,d4[1] - subs r8,r8,#8 - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vld1.32 {d4-d5}, [r3]! - vmlal.u32 q11,d29,d6[1] - vst1.64 {q6-q7}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q8-q9}, [r7,:256]! - - vmull.u32 q6,d28,d0[0] - vld1.32 {d6-d7}, [r3]! - vmull.u32 q7,d28,d0[1] - vst1.64 {q10-q11}, [r7,:256]! - vmull.u32 q8,d28,d1[0] - vmull.u32 q9,d28,d1[1] - vst1.64 {q12-q13}, [r7,:256]! - - vmull.u32 q10,d28,d2[0] - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vmull.u32 q13,d28,d3[1] - - bne .LNEON_1st - - vmlal.u32 q6,d29,d4[0] - add r6,sp,#16 - vmlal.u32 q7,d29,d4[1] - sub r1,r1,r5,lsl#2 @ rewind r1 - vmlal.u32 q8,d29,d5[0] - vld1.64 {q5}, [sp,:128] - vmlal.u32 q9,d29,d5[1] - sub r9,r5,#1 - - vmlal.u32 q10,d29,d6[0] - vst1.64 {q6-q7}, [r7,:256]! - vmlal.u32 q11,d29,d6[1] - vshr.u64 d10,d10,#16 - vld1.64 {q6}, [r6, :128]! - vmlal.u32 q12,d29,d7[0] - vst1.64 {q8-q9}, [r7,:256]! - vmlal.u32 q13,d29,d7[1] - - vst1.64 {q10-q11}, [r7,:256]! - vadd.u64 d10,d10,d11 - veor q4,q4,q4 - vst1.64 {q12-q13}, [r7,:256]! - vld1.64 {q7-q8}, [r6, :256]! - vst1.64 {q4}, [r7,:128] - vshr.u64 d10,d10,#16 - - b .LNEON_outer - -.align 4 -.LNEON_outer: - vld1.32 {d28[0]}, [r2,:32]! - sub r3,r3,r5,lsl#2 @ rewind r3 - vld1.32 {d0-d3}, [r1]! - veor d8,d8,d8 - mov r7,sp - vzip.16 d28,d8 - sub r8,r5,#8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vld1.64 {q9-q10},[r6,:256]! - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vld1.64 {q11-q12},[r6,:256]! - vmlal.u32 q9,d28,d1[1] - - vshl.i64 d10,d13,#16 - veor d8,d8,d8 - vadd.u64 d10,d10,d12 - vld1.64 {q13},[r6,:128]! - vmul.u32 d29,d10,d30 - - vmlal.u32 q10,d28,d2[0] - vld1.32 {d4-d7}, [r3]! - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - -.LNEON_inner: - vmlal.u32 q6,d29,d4[0] - vld1.32 {d0-d3}, [r1]! - vmlal.u32 q7,d29,d4[1] - subs r8,r8,#8 - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vst1.64 {q6-q7}, [r7,:256]! - - vmlal.u32 q10,d29,d6[0] - vld1.64 {q6}, [r6, :128]! - vmlal.u32 q11,d29,d6[1] - vst1.64 {q8-q9}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vld1.64 {q7-q8}, [r6, :256]! - vmlal.u32 q13,d29,d7[1] - vst1.64 {q10-q11}, [r7,:256]! - - vmlal.u32 q6,d28,d0[0] - vld1.64 {q9-q10}, [r6, :256]! - vmlal.u32 q7,d28,d0[1] - vst1.64 {q12-q13}, [r7,:256]! - vmlal.u32 q8,d28,d1[0] - vld1.64 {q11-q12}, [r6, :256]! - vmlal.u32 q9,d28,d1[1] - vld1.32 {d4-d7}, [r3]! 
- - vmlal.u32 q10,d28,d2[0] - vld1.64 {q13}, [r6, :128]! - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - - bne .LNEON_inner - - vmlal.u32 q6,d29,d4[0] - add r6,sp,#16 - vmlal.u32 q7,d29,d4[1] - sub r1,r1,r5,lsl#2 @ rewind r1 - vmlal.u32 q8,d29,d5[0] - vld1.64 {q5}, [sp,:128] - vmlal.u32 q9,d29,d5[1] - subs r9,r9,#1 - - vmlal.u32 q10,d29,d6[0] - vst1.64 {q6-q7}, [r7,:256]! - vmlal.u32 q11,d29,d6[1] - vld1.64 {q6}, [r6, :128]! - vshr.u64 d10,d10,#16 - vst1.64 {q8-q9}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vld1.64 {q7-q8}, [r6, :256]! - vmlal.u32 q13,d29,d7[1] - - vst1.64 {q10-q11}, [r7,:256]! - vadd.u64 d10,d10,d11 - vst1.64 {q12-q13}, [r7,:256]! - vshr.u64 d10,d10,#16 - - bne .LNEON_outer - - mov r7,sp - mov r8,r5 - -.LNEON_tail: - vadd.u64 d12,d12,d10 - vld1.64 {q9-q10}, [r6, :256]! - vshr.u64 d10,d12,#16 - vadd.u64 d13,d13,d10 - vld1.64 {q11-q12}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q13}, [r6, :128]! - vzip.16 d12,d13 - -.LNEON_tail2: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! - vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vld1.64 {q6}, [r6, :128]! - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vld1.64 {q7-q8}, [r6, :256]! - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! - - bne .LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -.LNEON_sub: - ldmia r1!, {r4-r7} - ldmia r3!, {r8-r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8-r11} - bne .LNEON_sub - - ldr r10, [r1] @ load top-most bit - veor q0,q0,q0 - sub r11,r2,sp @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -.LNEON_copy_n_zap: - ldmia r1!, {r4-r7} - ldmia r0, {r8-r11} - movcc r8, r4 - vst1.64 {q0-q1}, [r3,:256]! @ wipe - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0-q1}, [r3,:256]! @ wipe - movcc r11,r7 - ldmia r1, {r4-r7} - stmia r0!, {r8-r11} - sub r1,r1,#16 - ldmia r0, {r8-r11} - movcc r8, r4 - vst1.64 {q0-q1}, [r1,:256]! @ wipe - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0-q1}, [r3,:256]! 
@ wipe - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8-r11} - bne .LNEON_copy_n_zap - - sub sp,ip,#96 - vldmia sp!,{d8-d15} - ldmia sp!,{r4-r11} - bx lr @ .word 0xe12fff1e -.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon -#endif -.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 -#endif diff --git a/main/openssl/crypto/bn/asm/mips3.S b/main/openssl/crypto/bn/asm/mips3.S new file mode 100644 index 00000000..dca4105c --- /dev/null +++ b/main/openssl/crypto/bn/asm/mips3.S @@ -0,0 +1,2201 @@ +.rdata +.asciiz "mips3.s, Version 1.1" +.asciiz "MIPS III/IV ISA artwork by Andy Polyakov " + +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. + * + * Rights for redistribution and usage in source and binary forms are + * granted according to the OpenSSL license. Warranty of any kind is + * disclaimed. + * ==================================================================== + */ + +/* + * This is my modest contributon to the OpenSSL project (see + * http://www.openssl.org/ for more information about it) and is + * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c + * module. For updates see http://fy.chalmers.se/~appro/hpe/. + * + * The module is designed to work with either of the "new" MIPS ABI(5), + * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under + * IRIX 5.x not only because it doesn't support new ABIs but also + * because 5.x kernels put R4x00 CPU into 32-bit mode and all those + * 64-bit instructions (daddu, dmultu, etc.) found below gonna only + * cause illegal instruction exception:-( + * + * In addition the code depends on preprocessor flags set up by MIPSpro + * compiler driver (either as or cc) and therefore (probably?) can't be + * compiled by the GNU assembler. GNU C driver manages fine though... + * I mean as long as -mmips-as is specified or is the default option, + * because then it simply invokes /usr/bin/as which in turn takes + * perfect care of the preprocessor definitions. Another neat feature + * offered by the MIPSpro assembler is an optimization pass. This gave + * me the opportunity to have the code looking more regular as all those + * architecture dependent instruction rescheduling details were left to + * the assembler. Cool, huh? + * + * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' + * goes way over 3 times faster! + * + * + */ +#include +#include + +#if _MIPS_ISA>=4 +#define MOVNZ(cond,dst,src) \ + movn dst,src,cond +#else +#define MOVNZ(cond,dst,src) \ + .set noreorder; \ + bnezl cond,.+8; \ + move dst,src; \ + .set reorder +#endif + +.text + +.set noat +.set reorder + +#define MINUS4 v1 + +.align 5 +LEAF(bn_mul_add_words) + .set noreorder + bgtzl a2,.L_bn_mul_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_add_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + dmultu t0,a3 + ld t1,0(a0) + ld t2,8(a1) + ld t3,8(a0) + ld ta0,16(a1) + ld ta1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 /* All manuals say it "compares 32-bit + * values", but it seems to work fine + * even on 64-bit registers. 
*/ + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + + dmultu t2,a3 + ld ta2,24(a1) + ld ta3,24(a0) + daddu t3,v0 + sltu v0,t3,v0 + mflo AT + mfhi t2 + daddu t3,AT + daddu v0,t2 + sltu AT,t3,AT + sd t3,8(a0) + daddu v0,AT + + dmultu ta0,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + daddu ta1,v0 + sltu v0,ta1,v0 + mflo AT + mfhi ta0 + daddu ta1,AT + daddu v0,ta0 + sltu AT,ta1,AT + sd ta1,-16(a0) + daddu v0,AT + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + daddu ta3,v0 + sltu v0,ta3,v0 + mflo AT + mfhi ta2 + daddu ta3,AT + daddu v0,ta2 + sltu AT,ta3,AT + sd ta3,-8(a0) + daddu v0,AT + .set noreorder + bgtzl ta0,.L_bn_mul_add_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_add_words_return: + jr ra + +.L_bn_mul_add_words_tail: + dmultu t0,a3 + ld t1,0(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,8(a1) + dmultu t0,a3 + ld t1,8(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,8(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,16(a1) + dmultu t0,a3 + ld t1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,16(a0) + daddu v0,AT + jr ra +END(bn_mul_add_words) + +.align 5 +LEAF(bn_mul_words) + .set noreorder + bgtzl a2,.L_bn_mul_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + dmultu t0,a3 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + + dmultu t2,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + mflo AT + mfhi t2 + daddu v0,AT + sltu t3,v0,AT + sd v0,-24(a0) + daddu v0,t3,t2 + + dmultu ta0,a3 + mflo AT + mfhi ta0 + daddu v0,AT + sltu ta1,v0,AT + sd v0,-16(a0) + daddu v0,ta1,ta0 + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + mflo AT + mfhi ta2 + daddu v0,AT + sltu ta3,v0,AT + sd v0,-8(a0) + daddu v0,ta3,ta2 + .set noreorder + bgtzl ta0,.L_bn_mul_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_words_return: + jr ra + +.L_bn_mul_words_tail: + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,8(a1) + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,8(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,16(a1) + dmultu t0,a3 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,16(a0) + daddu v0,t1,t0 + jr ra +END(bn_mul_words) + +.align 5 +LEAF(bn_sqr_words) + .set noreorder + bgtzl a2,.L_bn_sqr_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sqr_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + dmultu t0,t0 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + + dmultu t2,t2 + subu a2,4 + PTR_ADD a0,64 + PTR_ADD a1,32 + mflo t3 + mfhi t2 + sd t3,-48(a0) + sd t2,-40(a0) + + dmultu ta0,ta0 + mflo ta1 + mfhi ta0 + sd ta1,-32(a0) + sd ta0,-24(a0) + + + dmultu ta2,ta2 + and ta0,a2,MINUS4 + mflo ta3 + mfhi ta2 + sd ta3,-16(a0) + sd ta2,-8(a0) + + .set noreorder + bgtzl 
ta0,.L_bn_sqr_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_sqr_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sqr_words_return: + move v0,zero + jr ra + +.L_bn_sqr_words_tail: + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,8(a1) + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,16(a0) + sd t0,24(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,16(a1) + dmultu t0,t0 + mflo t1 + mfhi t0 + sd t1,32(a0) + sd t0,40(a0) + jr ra +END(bn_sqr_words) + +.align 5 +LEAF(bn_add_words) + .set noreorder + bgtzl a3,.L_bn_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_add_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_add_words_tail + +.L_bn_add_words_loop: + ld ta0,0(a2) + subu a3,4 + ld t1,8(a1) + and AT,a3,MINUS4 + ld t2,16(a1) + PTR_ADD a2,32 + ld t3,24(a1) + PTR_ADD a0,32 + ld ta1,-24(a2) + PTR_ADD a1,32 + ld ta2,-16(a2) + ld ta3,-8(a2) + daddu ta0,t0 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + sltu v0,t0,ta0 + sd t0,-32(a0) + daddu v0,t8 + + daddu ta1,t1 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + sltu v0,t1,ta1 + sd t1,-24(a0) + daddu v0,t9 + + daddu ta2,t2 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,-16(a0) + daddu v0,t8 + + daddu ta3,t3 + sltu t9,ta3,t3 + daddu t3,ta3,v0 + sltu v0,t3,ta3 + sd t3,-8(a0) + daddu v0,t9 + + .set noreorder + bgtzl AT,.L_bn_add_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_add_words_return: + jr ra + +.L_bn_add_words_tail: + ld ta0,0(a2) + daddu ta0,t0 + subu a3,1 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + sltu v0,t0,ta0 + sd t0,0(a0) + daddu v0,t8 + beqz a3,.L_bn_add_words_return + + ld t1,8(a1) + ld ta1,8(a2) + daddu ta1,t1 + subu a3,1 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + sltu v0,t1,ta1 + sd t1,8(a0) + daddu v0,t9 + beqz a3,.L_bn_add_words_return + + ld t2,16(a1) + ld ta2,16(a2) + daddu ta2,t2 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,16(a0) + daddu v0,t8 + jr ra +END(bn_add_words) + +.align 5 +LEAF(bn_sub_words) + .set noreorder + bgtzl a3,.L_bn_sub_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sub_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + ld ta0,0(a2) + subu a3,4 + ld t1,8(a1) + and AT,a3,MINUS4 + ld t2,16(a1) + PTR_ADD a2,32 + ld t3,24(a1) + PTR_ADD a0,32 + ld ta1,-24(a2) + PTR_ADD a1,32 + ld ta2,-16(a2) + ld ta3,-8(a2) + sltu t8,t0,ta0 + dsubu t0,ta0 + dsubu ta0,t0,v0 + sd ta0,-32(a0) + MOVNZ (t0,v0,t8) + + sltu t9,t1,ta1 + dsubu t1,ta1 + dsubu ta1,t1,v0 + sd ta1,-24(a0) + MOVNZ (t1,v0,t9) + + + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + sd ta2,-16(a0) + MOVNZ (t2,v0,t8) + + sltu t9,t3,ta3 + dsubu t3,ta3 + dsubu ta3,t3,v0 + sd ta3,-8(a0) + MOVNZ (t3,v0,t9) + + .set noreorder + bgtzl AT,.L_bn_sub_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_sub_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sub_words_return: + jr ra + +.L_bn_sub_words_tail: + ld ta0,0(a2) + subu a3,1 + sltu t8,t0,ta0 + dsubu t0,ta0 + dsubu ta0,t0,v0 + MOVNZ (t0,v0,t8) + sd ta0,0(a0) + beqz a3,.L_bn_sub_words_return + + ld t1,8(a1) + subu a3,1 + ld ta1,8(a2) + sltu t9,t1,ta1 + dsubu t1,ta1 + dsubu ta1,t1,v0 + MOVNZ (t1,v0,t9) + sd ta1,8(a0) + beqz a3,.L_bn_sub_words_return + + ld t2,16(a1) + ld ta2,16(a2) + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + MOVNZ (t2,v0,t8) + sd ta2,16(a0) + jr ra +END(bn_sub_words) + +#undef MINUS4 + +.align 5 +LEAF(bn_div_3_words) + .set reorder + 
move a3,a0 /* we know that bn_div_words doesn't + * touch a3, ta2, ta3 and preserves a2 + * so that we can save two arguments + * and return address in registers + * instead of stack:-) + */ + ld a0,(a3) + move ta2,a1 + ld a1,-8(a3) + bne a0,a2,.L_bn_div_3_words_proceed + li v0,-1 + jr ra +.L_bn_div_3_words_proceed: + move ta3,ra + bal bn_div_words + move ra,ta3 + dmultu ta2,v0 + ld t2,-16(a3) + move ta0,zero + mfhi t1 + mflo t0 + sltu t8,t1,v1 +.L_bn_div_3_words_inner_loop: + bnez t8,.L_bn_div_3_words_inner_loop_done + sgeu AT,t2,t0 + seq t9,t1,v1 + and AT,t9 + sltu t3,t0,ta2 + daddu v1,a2 + dsubu t1,t3 + dsubu t0,ta2 + sltu t8,t1,v1 + sltu ta0,v1,a2 + or t8,ta0 + .set noreorder + beqzl AT,.L_bn_div_3_words_inner_loop + dsubu v0,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + jr ra +END(bn_div_3_words) + +.align 5 +LEAF(bn_div_words) + .set noreorder + bnezl a2,.L_bn_div_words_proceed + move v1,zero + jr ra + li v0,-1 /* I'd rather signal div-by-zero + * which can be done with 'break 7' */ + +.L_bn_div_words_proceed: + bltz a2,.L_bn_div_words_body + move t9,v1 + dsll a2,1 + bgtz a2,.-4 + addu t9,1 + + .set reorder + negu t1,t9 + li t2,-1 + dsll t2,t1 + and t2,a0 + dsrl AT,a1,t1 + .set noreorder + bnezl t2,.+8 + break 6 /* signal overflow */ + .set reorder + dsll a0,t9 + dsll a1,t9 + or a0,AT + +#define QT ta0 +#define HH ta1 +#define DH v1 +.L_bn_div_words_body: + dsrl DH,a2,32 + sgeu AT,a0,a2 + .set noreorder + bnezl AT,.+8 + dsubu a0,a2 + .set reorder + + li QT,-1 + dsrl HH,a0,32 + dsrl QT,32 /* q=0xffffffff */ + beq DH,HH,.L_bn_div_words_skip_div1 + ddivu zero,a0,DH + mflo QT +.L_bn_div_words_skip_div1: + dmultu a2,QT + dsll t3,a0,32 + dsrl AT,a1,32 + or t3,AT + mflo t0 + mfhi t1 +.L_bn_div_words_inner_loop1: + sltu t2,t3,t0 + seq t8,HH,t1 + sltu AT,HH,t1 + and t2,t8 + sltu v0,t0,a2 + or AT,t2 + .set noreorder + beqz AT,.L_bn_div_words_inner_loop1_done + dsubu t1,v0 + dsubu t0,a2 + b .L_bn_div_words_inner_loop1 + dsubu QT,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + dsll a1,32 + dsubu a0,t3,t0 + dsll v0,QT,32 + + li QT,-1 + dsrl HH,a0,32 + dsrl QT,32 /* q=0xffffffff */ + beq DH,HH,.L_bn_div_words_skip_div2 + ddivu zero,a0,DH + mflo QT +.L_bn_div_words_skip_div2: +#undef DH + dmultu a2,QT + dsll t3,a0,32 + dsrl AT,a1,32 + or t3,AT + mflo t0 + mfhi t1 +.L_bn_div_words_inner_loop2: + sltu t2,t3,t0 + seq t8,HH,t1 + sltu AT,HH,t1 + and t2,t8 + sltu v1,t0,a2 + or AT,t2 + .set noreorder + beqz AT,.L_bn_div_words_inner_loop2_done + dsubu t1,v1 + dsubu t0,a2 + b .L_bn_div_words_inner_loop2 + dsubu QT,1 + .set reorder +.L_bn_div_words_inner_loop2_done: +#undef HH + + dsubu a0,t3,t0 + or v0,QT + dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ + dsrl a2,t9 /* restore a2 */ + jr ra +#undef QT +END(bn_div_words) + +#define a_0 t0 +#define a_1 t1 +#define a_2 t2 +#define a_3 t3 +#define b_0 ta0 +#define b_1 ta1 +#define b_2 ta2 +#define b_3 ta3 + +#define a_4 s0 +#define a_5 s2 +#define a_6 s4 +#define a_7 a1 /* once we load a[7] we don't need a anymore */ +#define b_4 s1 +#define b_5 s3 +#define b_6 s5 +#define b_7 a2 /* once we load b[7] we don't need b anymore */ + +#define t_1 t8 +#define t_2 t9 + +#define c_1 v0 +#define c_2 v1 +#define c_3 a3 + +#define FRAME_SIZE 48 + +.align 5 +LEAF(bn_mul_comba8) + .set noreorder + PTR_SUB sp,FRAME_SIZE + .frame sp,64,ra + .set reorder + ld a_0,0(a1) /* If compiled with -mips3 option on + * R5000 box assembler barks on this + * line with "shouldn't have mult/div + * as last instruction in bb (R10K + * bug)" warning. 
If anybody out there + * has a clue about how to circumvent + * this do send me a note. + * + */ + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + sd s0,0(sp) + sd s1,8(sp) + sd s2,16(sp) + sd s3,24(sp) + sd s4,32(sp) + sd s5,40(sp) + mflo c_1 + mfhi c_2 + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + ld b_4,32(a2) + ld b_5,40(a2) + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + ld b_6,48(a2) + ld b_7,56(a2) + sd c_1,0(a0) /* r[0]=c1; */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) /* r[1]=c2; */ + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) /* r[2]=c3; */ + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) /* r[3]=c1; */ + + dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) /* r[4]=c2; */ + + dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu 
c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) /* r[5]=c3; */ + + dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) /* r[6]=c1; */ + + dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) /* r[7]=c2; */ + + dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu 
AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) /* r[8]=c3; */ + + dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) /* r[9]=c1; */ + + dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) /* r[10]=c2; */ + + dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) /* r[11]=c3; */ + + dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + 
daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) /* r[12]=c1; */ + + dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,104(a0) /* r[13]=c2; */ + + dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + ld s0,0(sp) + ld s1,8(sp) + ld s2,16(sp) + ld s3,24(sp) + ld s4,32(sp) + ld s5,40(sp) + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) /* r[14]=c3; */ + sd c_1,120(a0) /* r[15]=c1; */ + + PTR_ADD sp,FRAME_SIZE + + jr ra +END(bn_mul_comba8) + +.align 5 +LEAF(bn_mul_comba4) + .set reorder + ld a_0,0(a1) + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + 
daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_mul_comba4) + +#undef a_4 +#undef a_5 +#undef a_6 +#undef a_7 +#define a_4 b_0 +#define a_5 b_1 +#define a_6 b_2 +#define a_7 b_3 + +.align 5 +LEAF(bn_sqr_comba8) + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu 
AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) + + dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) + + dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) + + dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu 
c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) + + dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) + + dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) + + dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) + + dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,104(a0) + + dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) + sd c_1,120(a0) + + jr ra +END(bn_sqr_comba8) + +.align 5 +LEAF(bn_sqr_comba4) + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu 
AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_sqr_comba4) diff --git a/main/openssl/crypto/bn/asm/mips3.s b/main/openssl/crypto/bn/asm/mips3.s deleted file mode 100644 index dca4105c..00000000 --- a/main/openssl/crypto/bn/asm/mips3.s +++ /dev/null @@ -1,2201 +0,0 @@ -.rdata -.asciiz "mips3.s, Version 1.1" -.asciiz "MIPS III/IV ISA artwork by Andy Polyakov " - -/* - * ==================================================================== - * Written by Andy Polyakov for the OpenSSL - * project. - * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. - * ==================================================================== - */ - -/* - * This is my modest contributon to the OpenSSL project (see - * http://www.openssl.org/ for more information about it) and is - * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c - * module. For updates see http://fy.chalmers.se/~appro/hpe/. - * - * The module is designed to work with either of the "new" MIPS ABI(5), - * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under - * IRIX 5.x not only because it doesn't support new ABIs but also - * because 5.x kernels put R4x00 CPU into 32-bit mode and all those - * 64-bit instructions (daddu, dmultu, etc.) found below gonna only - * cause illegal instruction exception:-( - * - * In addition the code depends on preprocessor flags set up by MIPSpro - * compiler driver (either as or cc) and therefore (probably?) can't be - * compiled by the GNU assembler. GNU C driver manages fine though... - * I mean as long as -mmips-as is specified or is the default option, - * because then it simply invokes /usr/bin/as which in turn takes - * perfect care of the preprocessor definitions. Another neat feature - * offered by the MIPSpro assembler is an optimization pass. This gave - * me the opportunity to have the code looking more regular as all those - * architecture dependent instruction rescheduling details were left to - * the assembler. Cool, huh? - * - * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' - * goes way over 3 times faster! 
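
As the header comment above notes, this file is a drop-in for the word-level routines in crypto/bn/bn_asm.c. For orientation, here is a rough C sketch of the contract of the two routines that open the file, bn_mul_add_words and bn_mul_words. The _sketch names, the unsigned __int128 temporary and the 64-bit BN_ULONG typedef are illustrative assumptions, not OpenSSL code; the assembly below tracks the same carries with sltu instead of a wide type.

/* Sketch of the word-level semantics the MIPS routines implement.
 * BN_ULONG is assumed to be a 64-bit limb; __int128 keeps the sketch
 * short, whereas the assembly propagates carries with sltu. */
typedef unsigned long long BN_ULONG;

BN_ULONG bn_mul_add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                                 int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    for (int i = 0; i < num; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * w + rp[i] + carry;
        rp[i] = (BN_ULONG)t;          /* low 64 bits back into r[] */
        carry = (BN_ULONG)(t >> 64);  /* high 64 bits become the carry */
    }
    return carry;                     /* v0 in the assembly */
}

BN_ULONG bn_mul_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                             int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    for (int i = 0; i < num; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * w + carry;
        rp[i] = (BN_ULONG)t;
        carry = (BN_ULONG)(t >> 64);
    }
    return carry;
}

Each pass of the unrolled MIPS loops corresponds to four turns of these loops, with the loads scheduled ahead of the multiplies.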
- * - * - */ -#include -#include - -#if _MIPS_ISA>=4 -#define MOVNZ(cond,dst,src) \ - movn dst,src,cond -#else -#define MOVNZ(cond,dst,src) \ - .set noreorder; \ - bnezl cond,.+8; \ - move dst,src; \ - .set reorder -#endif - -.text - -.set noat -.set reorder - -#define MINUS4 v1 - -.align 5 -LEAF(bn_mul_add_words) - .set noreorder - bgtzl a2,.L_bn_mul_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_add_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_add_words_tail - -.L_bn_mul_add_words_loop: - dmultu t0,a3 - ld t1,0(a0) - ld t2,8(a1) - ld t3,8(a0) - ld ta0,16(a1) - ld ta1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 /* All manuals say it "compares 32-bit - * values", but it seems to work fine - * even on 64-bit registers. */ - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - - dmultu t2,a3 - ld ta2,24(a1) - ld ta3,24(a0) - daddu t3,v0 - sltu v0,t3,v0 - mflo AT - mfhi t2 - daddu t3,AT - daddu v0,t2 - sltu AT,t3,AT - sd t3,8(a0) - daddu v0,AT - - dmultu ta0,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - daddu ta1,v0 - sltu v0,ta1,v0 - mflo AT - mfhi ta0 - daddu ta1,AT - daddu v0,ta0 - sltu AT,ta1,AT - sd ta1,-16(a0) - daddu v0,AT - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - daddu ta3,v0 - sltu v0,ta3,v0 - mflo AT - mfhi ta2 - daddu ta3,AT - daddu v0,ta2 - sltu AT,ta3,AT - sd ta3,-8(a0) - daddu v0,AT - .set noreorder - bgtzl ta0,.L_bn_mul_add_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_add_words_return: - jr ra - -.L_bn_mul_add_words_tail: - dmultu t0,a3 - ld t1,0(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,8(a1) - dmultu t0,a3 - ld t1,8(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,8(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,16(a1) - dmultu t0,a3 - ld t1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,16(a0) - daddu v0,AT - jr ra -END(bn_mul_add_words) - -.align 5 -LEAF(bn_mul_words) - .set noreorder - bgtzl a2,.L_bn_mul_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_words_tail - -.L_bn_mul_words_loop: - dmultu t0,a3 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - - dmultu t2,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - mflo AT - mfhi t2 - daddu v0,AT - sltu t3,v0,AT - sd v0,-24(a0) - daddu v0,t3,t2 - - dmultu ta0,a3 - mflo AT - mfhi ta0 - daddu v0,AT - sltu ta1,v0,AT - sd v0,-16(a0) - daddu v0,ta1,ta0 - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - mflo AT - mfhi ta2 - daddu v0,AT - sltu ta3,v0,AT - sd v0,-8(a0) - daddu v0,ta3,ta2 - .set noreorder - bgtzl ta0,.L_bn_mul_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_words_return: - jr ra - -.L_bn_mul_words_tail: - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,8(a1) - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,8(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,16(a1) - dmultu t0,a3 - 
mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,16(a0) - daddu v0,t1,t0 - jr ra -END(bn_mul_words) - -.align 5 -LEAF(bn_sqr_words) - .set noreorder - bgtzl a2,.L_bn_sqr_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sqr_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_sqr_words_tail - -.L_bn_sqr_words_loop: - dmultu t0,t0 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - - dmultu t2,t2 - subu a2,4 - PTR_ADD a0,64 - PTR_ADD a1,32 - mflo t3 - mfhi t2 - sd t3,-48(a0) - sd t2,-40(a0) - - dmultu ta0,ta0 - mflo ta1 - mfhi ta0 - sd ta1,-32(a0) - sd ta0,-24(a0) - - - dmultu ta2,ta2 - and ta0,a2,MINUS4 - mflo ta3 - mfhi ta2 - sd ta3,-16(a0) - sd ta2,-8(a0) - - .set noreorder - bgtzl ta0,.L_bn_sqr_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_sqr_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sqr_words_return: - move v0,zero - jr ra - -.L_bn_sqr_words_tail: - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,8(a1) - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,16(a0) - sd t0,24(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,16(a1) - dmultu t0,t0 - mflo t1 - mfhi t0 - sd t1,32(a0) - sd t0,40(a0) - jr ra -END(bn_sqr_words) - -.align 5 -LEAF(bn_add_words) - .set noreorder - bgtzl a3,.L_bn_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_add_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_add_words_tail - -.L_bn_add_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - daddu ta0,t0 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,-32(a0) - daddu v0,t8 - - daddu ta1,t1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,-24(a0) - daddu v0,t9 - - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,-16(a0) - daddu v0,t8 - - daddu ta3,t3 - sltu t9,ta3,t3 - daddu t3,ta3,v0 - sltu v0,t3,ta3 - sd t3,-8(a0) - daddu v0,t9 - - .set noreorder - bgtzl AT,.L_bn_add_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_add_words_return: - jr ra - -.L_bn_add_words_tail: - ld ta0,0(a2) - daddu ta0,t0 - subu a3,1 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,0(a0) - daddu v0,t8 - beqz a3,.L_bn_add_words_return - - ld t1,8(a1) - ld ta1,8(a2) - daddu ta1,t1 - subu a3,1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,8(a0) - daddu v0,t9 - beqz a3,.L_bn_add_words_return - - ld t2,16(a1) - ld ta2,16(a2) - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,16(a0) - daddu v0,t8 - jr ra -END(bn_add_words) - -.align 5 -LEAF(bn_sub_words) - .set noreorder - bgtzl a3,.L_bn_sub_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sub_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_sub_words_tail - -.L_bn_sub_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - sd ta0,-32(a0) - MOVNZ (t0,v0,t8) - - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - sd ta1,-24(a0) - MOVNZ (t1,v0,t9) - - - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - sd ta2,-16(a0) - MOVNZ 
(t2,v0,t8) - - sltu t9,t3,ta3 - dsubu t3,ta3 - dsubu ta3,t3,v0 - sd ta3,-8(a0) - MOVNZ (t3,v0,t9) - - .set noreorder - bgtzl AT,.L_bn_sub_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_sub_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sub_words_return: - jr ra - -.L_bn_sub_words_tail: - ld ta0,0(a2) - subu a3,1 - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - MOVNZ (t0,v0,t8) - sd ta0,0(a0) - beqz a3,.L_bn_sub_words_return - - ld t1,8(a1) - subu a3,1 - ld ta1,8(a2) - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - MOVNZ (t1,v0,t9) - sd ta1,8(a0) - beqz a3,.L_bn_sub_words_return - - ld t2,16(a1) - ld ta2,16(a2) - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - MOVNZ (t2,v0,t8) - sd ta2,16(a0) - jr ra -END(bn_sub_words) - -#undef MINUS4 - -.align 5 -LEAF(bn_div_3_words) - .set reorder - move a3,a0 /* we know that bn_div_words doesn't - * touch a3, ta2, ta3 and preserves a2 - * so that we can save two arguments - * and return address in registers - * instead of stack:-) - */ - ld a0,(a3) - move ta2,a1 - ld a1,-8(a3) - bne a0,a2,.L_bn_div_3_words_proceed - li v0,-1 - jr ra -.L_bn_div_3_words_proceed: - move ta3,ra - bal bn_div_words - move ra,ta3 - dmultu ta2,v0 - ld t2,-16(a3) - move ta0,zero - mfhi t1 - mflo t0 - sltu t8,t1,v1 -.L_bn_div_3_words_inner_loop: - bnez t8,.L_bn_div_3_words_inner_loop_done - sgeu AT,t2,t0 - seq t9,t1,v1 - and AT,t9 - sltu t3,t0,ta2 - daddu v1,a2 - dsubu t1,t3 - dsubu t0,ta2 - sltu t8,t1,v1 - sltu ta0,v1,a2 - or t8,ta0 - .set noreorder - beqzl AT,.L_bn_div_3_words_inner_loop - dsubu v0,1 - .set reorder -.L_bn_div_3_words_inner_loop_done: - jr ra -END(bn_div_3_words) - -.align 5 -LEAF(bn_div_words) - .set noreorder - bnezl a2,.L_bn_div_words_proceed - move v1,zero - jr ra - li v0,-1 /* I'd rather signal div-by-zero - * which can be done with 'break 7' */ - -.L_bn_div_words_proceed: - bltz a2,.L_bn_div_words_body - move t9,v1 - dsll a2,1 - bgtz a2,.-4 - addu t9,1 - - .set reorder - negu t1,t9 - li t2,-1 - dsll t2,t1 - and t2,a0 - dsrl AT,a1,t1 - .set noreorder - bnezl t2,.+8 - break 6 /* signal overflow */ - .set reorder - dsll a0,t9 - dsll a1,t9 - or a0,AT - -#define QT ta0 -#define HH ta1 -#define DH v1 -.L_bn_div_words_body: - dsrl DH,a2,32 - sgeu AT,a0,a2 - .set noreorder - bnezl AT,.+8 - dsubu a0,a2 - .set reorder - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div1 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div1: - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop1: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v0,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop1_done - dsubu t1,v0 - dsubu t0,a2 - b .L_bn_div_words_inner_loop1 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop1_done: - - dsll a1,32 - dsubu a0,t3,t0 - dsll v0,QT,32 - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div2 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div2: -#undef DH - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop2: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v1,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop2_done - dsubu t1,v1 - dsubu t0,a2 - b .L_bn_div_words_inner_loop2 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop2_done: -#undef HH - - dsubu a0,t3,t0 - or v0,QT - dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ - dsrl a2,t9 /* restore a2 */ - jr 
ra -#undef QT -END(bn_div_words) - -#define a_0 t0 -#define a_1 t1 -#define a_2 t2 -#define a_3 t3 -#define b_0 ta0 -#define b_1 ta1 -#define b_2 ta2 -#define b_3 ta3 - -#define a_4 s0 -#define a_5 s2 -#define a_6 s4 -#define a_7 a1 /* once we load a[7] we don't need a anymore */ -#define b_4 s1 -#define b_5 s3 -#define b_6 s5 -#define b_7 a2 /* once we load b[7] we don't need b anymore */ - -#define t_1 t8 -#define t_2 t9 - -#define c_1 v0 -#define c_2 v1 -#define c_3 a3 - -#define FRAME_SIZE 48 - -.align 5 -LEAF(bn_mul_comba8) - .set noreorder - PTR_SUB sp,FRAME_SIZE - .frame sp,64,ra - .set reorder - ld a_0,0(a1) /* If compiled with -mips3 option on - * R5000 box assembler barks on this - * line with "shouldn't have mult/div - * as last instruction in bb (R10K - * bug)" warning. If anybody out there - * has a clue about how to circumvent - * this do send me a note. - * - */ - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - sd s0,0(sp) - sd s1,8(sp) - sd s2,16(sp) - sd s3,24(sp) - sd s4,32(sp) - sd s5,40(sp) - mflo c_1 - mfhi c_2 - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - ld b_4,32(a2) - ld b_5,40(a2) - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - ld b_6,48(a2) - ld b_7,56(a2) - sd c_1,0(a0) /* r[0]=c1; */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) /* r[1]=c2; */ - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) /* r[2]=c3; */ - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) /* r[3]=c1; */ - - dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - 
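
Every dmultu/mflo/mfhi/daddu/sltu group in the comba routines is one instance of the mul_add_c(a[i],b[j],...) step named in its trailing comment: add the double-width product a[i]*b[j] into a rotating three-word accumulator. A minimal C sketch of that step follows; the helper name and the 64-bit limb typedef are mine, not the OpenSSL macro itself.

typedef unsigned long long BN_ULONG;   /* 64-bit limb, as on MIPS III/IV */

/* Add the full product a*b into the running three-word column sum
 * (c0 = low, c1 = middle, c2 = high), mirroring the mflo/mfhi results
 * and the sltu carry checks in the assembly. */
static void mul_add_c_sketch(BN_ULONG a, BN_ULONG b,
                             BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
{
    unsigned __int128 t = (unsigned __int128)a * b;
    BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

    *c0 += lo;
    if (*c0 < lo)        /* carry out of the low word (the sltu) */
        hi++;            /* cannot overflow: hi is at most 2^64 - 2 here */
    *c1 += hi;
    if (*c1 < hi)        /* carry out of the middle word */
        (*c2)++;
}

The mul_add_c2 variants used by the squaring routines double the product first, which is why those blocks begin with slt/dsll on the mflo/mfhi values before the same carry chain.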
dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) /* r[4]=c2; */ - - dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) /* r[5]=c3; */ - - dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) /* r[6]=c1; */ - - dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,b_1 /* 
mul_add_c(a[6],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) /* r[7]=c2; */ - - dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) /* r[8]=c3; */ - - dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) /* r[9]=c1; */ - - dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) /* r[10]=c2; */ - - 
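
The stores such as sd c_2,80(a0) /* r[10]=c2; */ mark column boundaries: the comba routines accumulate, for each output word r[k], every product a[i]*b[j] with i+j == k, then retire the finished low accumulator and rotate c1/c2/c3. Written as a loop rather than unrolled, the structure is roughly as below; the function name is hypothetical and the real routines are fully unrolled for 4 and 8 words.

typedef unsigned long long BN_ULONG;

/* Column-by-column ("comba") schoolbook multiply: r[k] collects every
 * product a[i]*b[j] with i + j == k, the same grouping as the unrolled
 * mul_add_c blocks; only the carry bookkeeping differs, since the
 * assembly rotates three named accumulators instead of shifting. */
static void bn_mul_comba_sketch(BN_ULONG *r, const BN_ULONG *a,
                                const BN_ULONG *b, int n)
{
    unsigned __int128 acc = 0;     /* low 128 bits of the column sum */
    BN_ULONG top = 0;              /* overflow beyond 128 bits       */

    for (int k = 0; k < 2 * n - 1; k++) {
        int lo_i = k < n ? 0 : k - n + 1;
        int hi_i = k < n ? k : n - 1;
        for (int i = lo_i; i <= hi_i; i++) {
            unsigned __int128 t = (unsigned __int128)a[i] * b[k - i];
            acc += t;
            if (acc < t)           /* carry into the third word */
                top++;
        }
        r[k] = (BN_ULONG)acc;                       /* emit this column */
        acc  = (acc >> 64) | ((unsigned __int128)top << 64);
        top  = 0;
    }
    r[2 * n - 1] = (BN_ULONG)acc;                   /* final high word  */
}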
dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) /* r[11]=c3; */ - - dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) /* r[12]=c1; */ - - dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) /* r[13]=c2; */ - - dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - ld s0,0(sp) - ld s1,8(sp) - ld s2,16(sp) - ld s3,24(sp) - ld s4,32(sp) - ld s5,40(sp) - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) /* r[14]=c3; */ - sd c_1,120(a0) /* r[15]=c1; */ - - PTR_ADD sp,FRAME_SIZE - - jr ra -END(bn_mul_comba8) - -.align 5 -LEAF(bn_mul_comba4) - .set reorder - ld a_0,0(a1) - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - 
dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_mul_comba4) - -#undef a_4 -#undef a_5 -#undef a_6 -#undef a_7 -#define a_4 b_0 -#define a_5 b_1 -#define a_6 b_2 -#define a_7 b_3 - -.align 5 -LEAF(bn_sqr_comba8) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu 
c_1,AT - sd c_2,32(a0) - - dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) - - dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) - - dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu 
t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) - - dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) - - dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) - - dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) - - dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) - - dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) - - dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) - sd c_1,120(a0) - - jr ra -END(bn_sqr_comba8) - -.align 5 -LEAF(bn_sqr_comba4) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo 
t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_sqr_comba4) diff --git a/main/openssl/crypto/bn/asm/pa-risc2.S b/main/openssl/crypto/bn/asm/pa-risc2.S new file mode 100644 index 00000000..f3b16290 --- /dev/null +++ b/main/openssl/crypto/bn/asm/pa-risc2.S @@ -0,0 +1,1618 @@ +; +; PA-RISC 2.0 implementation of bn_asm code, based on the +; 64-bit version of the code. This code is effectively the +; same as the 64-bit version except the register model is +; slightly different given all values must be 32-bit between +; function calls. Thus the 64-bit return values are returned +; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit +; +; +; This code is approximately 2x faster than the C version +; for RSA/DSA. +; +; See http://devresource.hp.com/ for more details on the PA-RISC +; architecture. Also see the book "PA-RISC 2.0 Architecture" +; by Gerry Kane for information on the instruction set architecture. +; +; Code written by Chris Ruemmler (with some help from the HP C +; compiler). +; +; The code compiles with HP's assembler +; + + .level 2.0N + .space $TEXT$ + .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY + +; +; Global Register definitions used for the routines. +; +; Some information about HP's runtime architecture for 32-bits. +; +; "Caller save" means the calling function must save the register +; if it wants the register to be preserved. +; "Callee save" means if a function uses the register, it must save +; the value before using it. 
+; +; For the floating point registers +; +; "caller save" registers: fr4-fr11, fr22-fr31 +; "callee save" registers: fr12-fr21 +; "special" registers: fr0-fr3 (status and exception registers) +; +; For the integer registers +; value zero : r0 +; "caller save" registers: r1,r19-r26 +; "callee save" registers: r3-r18 +; return register : r2 (rp) +; return values ; r28,r29 (ret0,ret1) +; Stack pointer ; r30 (sp) +; millicode return ptr ; r31 (also a caller save register) + + +; +; Arguments to the routines +; +r_ptr .reg %r26 +a_ptr .reg %r25 +b_ptr .reg %r24 +num .reg %r24 +n .reg %r23 + +; +; Note that the "w" argument for bn_mul_add_words and bn_mul_words +; is passed on the stack at a delta of -56 from the top of stack +; as the routine is entered. +; + +; +; Globals used in some routines +; + +top_overflow .reg %r23 +high_mask .reg %r22 ; value 0xffffffff80000000L + + +;------------------------------------------------------------------------------ +; +; bn_mul_add_words +; +;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, +; int num, BN_ULONG w) +; +; arg0 = r_ptr +; arg1 = a_ptr +; arg3 = num +; -56(sp) = w +; +; Local register definitions +; + +fm1 .reg %fr22 +fm .reg %fr23 +ht_temp .reg %fr24 +ht_temp_1 .reg %fr25 +lt_temp .reg %fr26 +lt_temp_1 .reg %fr27 +fm1_1 .reg %fr28 +fm_1 .reg %fr29 + +fw_h .reg %fr7L +fw_l .reg %fr7R +fw .reg %fr7 + +fht_0 .reg %fr8L +flt_0 .reg %fr8R +t_float_0 .reg %fr8 + +fht_1 .reg %fr9L +flt_1 .reg %fr9R +t_float_1 .reg %fr9 + +tmp_0 .reg %r31 +tmp_1 .reg %r21 +m_0 .reg %r20 +m_1 .reg %r19 +ht_0 .reg %r1 +ht_1 .reg %r3 +lt_0 .reg %r4 +lt_1 .reg %r5 +m1_0 .reg %r6 +m1_1 .reg %r7 +rp_val .reg %r8 +rp_val_1 .reg %r9 + +bn_mul_add_words + .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN + .proc + .callinfo frame=128 + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + NOP ; Needed to make the loop 16-byte aligned + NOP ; needed to make the loop 16-byte aligned + + STD %r5,16(%sp) ; save r5 + NOP + STD %r6,24(%sp) ; save r6 + STD %r7,32(%sp) ; save r7 + + STD %r8,40(%sp) ; save r8 + STD %r9,48(%sp) ; save r9 + COPY %r0,%ret1 ; return 0 by default + DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 + + CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit + LDO 128(%sp),%sp ; bump stack + + ; + ; The loop is unrolled twice, so if there is only 1 number + ; then go straight to the cleanup code. + ; + CMPIB,= 1,num,bn_mul_add_words_single_top + FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus + ; two 32-bit mutiplies can be issued per cycle. 
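
XMPYU multiplies 32-bit halves held in floating point registers, so each 64x64-bit product in the loop below is assembled from the four partial products ht*wl, lt*wh, ht*wh and lt*wl. A C sketch of that combination step follows; the names are illustrative and u64/u32 are assumed 64/32-bit types, not any OpenSSL API.

typedef unsigned long long u64;
typedef unsigned int u32;

/* Combine the four XMPYU partial products of a 64x64-bit multiply,
 * following the same steps as the loop body: sum the two cross terms,
 * fold their carry and high half into ht, and their low half into lt. */
static void mul64x64_sketch(u64 a, u64 w, u64 *hi, u64 *lo)
{
    u32 ah = (u32)(a >> 32), al = (u32)a;
    u32 wh = (u32)(w >> 32), wl = (u32)w;

    u64 m1 = (u64)ah * wl;          /* XMPYU fht,fw_l,fm1 */
    u64 m  = (u64)al * wh;          /* XMPYU flt,fw_h,fm  */
    u64 ht = (u64)ah * wh;          /* XMPYU fht,fw_h     */
    u64 lt = (u64)al * wl;          /* XMPYU flt,fw_l     */

    u64 tmp = m + m1;               /* sum of the cross terms           */
    if (tmp < m1)                   /* CMPCLR,*>>= : carry out of tmp   */
        ht += (u64)1 << 32;         /* ht += top_overflow               */
    ht += tmp >> 32;                /* high half of the cross terms     */
    u64 m1lo = tmp << 32;           /* low half, aligned under lt       */

    lt += m1lo;
    if (lt < m1lo)                  /* the ADD,DC in the assembly       */
        ht++;

    *hi = ht;
    *lo = lt;
}

The remaining ADD/ADD,DC pairs in the loop then fold the incoming carry and rp[i] into lt, carrying into ht, which becomes the carry for the next word.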
+ ; +bn_mul_add_words_unroll2 + + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) + LDD 0(r_ptr),rp_val ; rp[0] + LDD 8(r_ptr),rp_val_1 ; rp[1] + + XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l + XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1[0] + FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] + + XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h + XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m[0] + FSTD fm_1,-40(%sp) ; -40(sp) = m[1] + + XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h + XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp + FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 + + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp + FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 + + LDD -8(%sp),m_0 ; m[0] + LDD -40(%sp),m_1 ; m[1] + LDD -16(%sp),m1_0 ; m1[0] + LDD -48(%sp),m1_1 ; m1[1] + + LDD -24(%sp),ht_0 ; ht[0] + LDD -56(%sp),ht_1 ; ht[1] + ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; + ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; + + LDD -32(%sp),lt_0 + LDD -64(%sp),lt_1 + CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) + ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) + + CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) + ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) + EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 + + EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 + DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 + ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) + ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) + + ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + + ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + + LDO -2(num),num ; num = num - 2; + ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + STD lt_0,0(r_ptr) ; rp[0] = lt[0] + + ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] + ADD,DC ht_1,%r0,%ret1 ; ht[1]++ + LDO 16(a_ptr),a_ptr ; a_ptr += 2 + + STD lt_1,8(r_ptr) ; rp[1] = lt[1] + CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do + LDO 16(r_ptr),r_ptr ; r_ptr += 2 + + CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one + + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_mul_add_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + LDD 0(r_ptr),rp_val ; rp[0] + LDO 8(a_ptr),a_ptr ; a_ptr++ + XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + + LDD -8(%sp),m_0 + LDD -16(%sp),m1_0 ; m1 = temp1 + ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; + LDD -24(%sp),ht_0 + LDD -32(%sp),lt_0 + + CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + + EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + ADD %ret1,tmp_0,lt_0 ; lt = lt + c; + ADD,DC ht_0,%r0,ht_0 ; ht++ + ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] + ADD,DC 
ht_0,%r0,%ret1 ; ht++ + STD lt_0,0(r_ptr) ; rp[0] = lt + +bn_mul_add_words_exit + .EXIT + + EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 + LDD -80(%sp),%r9 ; restore r9 + LDD -88(%sp),%r8 ; restore r8 + LDD -96(%sp),%r7 ; restore r7 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 ; restore r3 + .PROCEND ;in=23,24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +; +; arg0 = rp +; arg1 = ap +; arg3 = num +; w on stack at -56(sp) + +bn_mul_words + .proc + .callinfo frame=128 + .entry + .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + NOP + STD %r5,16(%sp) ; save r5 + + STD %r6,24(%sp) ; save r6 + STD %r7,32(%sp) ; save r7 + COPY %r0,%ret1 ; return 0 by default + DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 + + CMPIB,>= 0,num,bn_mul_words_exit + LDO 128(%sp),%sp ; bump stack + + ; + ; See if only 1 word to do, thus just do cleanup + ; + CMPIB,= 1,num,bn_mul_words_single_top + FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus + ; two 32-bit mutiplies can be issued per cycle. + ; +bn_mul_words_unroll2 + + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) + XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l + XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l + + FSTD fm1,-16(%sp) ; -16(sp) = m1 + FSTD fm1_1,-48(%sp) ; -48(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h + + FSTD fm,-8(%sp) ; -8(sp) = m + FSTD fm_1,-40(%sp) ; -40(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h + XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h + + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l + + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt + LDD -8(%sp),m_0 + LDD -40(%sp),m_1 + + LDD -16(%sp),m1_0 + LDD -48(%sp),m1_1 + LDD -24(%sp),ht_0 + LDD -56(%sp),ht_1 + + ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; + ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; + LDD -32(%sp),lt_0 + LDD -64(%sp),lt_1 + + CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) + ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) + + EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + EXTRD,U tmp_1,31,32,m_1 ; m>>32 + DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) + ADD lt_0,m1_0,lt_0 ; lt = lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD lt_1,m1_1,lt_1 ; lt = lt+m1; + ADD,DC ht_1,%r0,ht_1 ; ht++ + ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1); + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) + ADD,DC ht_1,%r0,ht_1 ; ht++ + STD lt_0,0(r_ptr) ; rp[0] = lt + STD lt_1,8(r_ptr) ; rp[1] = lt + + COPY ht_1,%ret1 ; carry = ht + LDO -2(num),num ; num = num - 2; + LDO 16(a_ptr),a_ptr ; ap += 2 + CMPIB,<= 2,num,bn_mul_words_unroll2 + LDO 16(r_ptr),r_ptr ; rp++ + + CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? 
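[Editorial reference, not part of the patch or of OpenSSL's sources: the unrolled XMPYU/FSTD sequences above assemble a 64x64-bit product from four 32x32 partial products; the plain-C restatement below shows the same carry handling for bn_mul_add_words. The limb type, the helper name mul_add_words_ref and the variable names are illustrative assumptions only; bn_mul_words is the identical computation without the rp[i] term. The FPU is used because, as the comments note, PA-RISC 2.0 has two fully pipelined multipliers, so two 32-bit XMPYUs can issue per cycle.]

#include <stdint.h>

typedef uint64_t BN_ULONG;              /* assumed 64-bit limb, as in this .level 2.0N code */

/* rp[i] = low64(ap[i]*w + rp[i] + carry); returns the final carry limb */
BN_ULONG mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG w_h = w >> 32, w_l = w & 0xffffffffUL;

    for (int i = 0; i < num; i++) {
        BN_ULONG a_h = ap[i] >> 32, a_l = ap[i] & 0xffffffffUL;

        BN_ULONG ht = a_h * w_h;        /* ht_temp: high partial product   */
        BN_ULONG lt = a_l * w_l;        /* lt_temp: low partial product    */
        BN_ULONG m  = a_l * w_h;        /* m  (XMPYU flt,fw_h)             */
        BN_ULONG m1 = a_h * w_l;        /* m1 (XMPYU fht,fw_l)             */

        BN_ULONG mid = m + m1;          /* the CMPCLR/ADD,L pair handles   */
        if (mid < m1)                   /* this wrap by adding 1<<32       */
            ht += (BN_ULONG)1 << 32;    /* top_overflow                    */
        ht += mid >> 32;

        BN_ULONG lo = lt + (mid << 32); /* ADD / ADD,DC carry propagation  */
        ht += (lo < lt);
        lo += carry;  ht += (lo < carry);
        lo += rp[i];  ht += (lo < rp[i]);

        rp[i] = lo;
        carry = ht;
    }
    return carry;
}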
+ + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_mul_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + + XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + + LDD -8(%sp),m_0 + LDD -16(%sp),m1_0 + ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; + LDD -24(%sp),ht_0 + LDD -32(%sp),lt_0 + + CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + + EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD lt_0,m1_0,lt_0 ; lt= lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD %ret1,lt_0,lt_0 ; lt = lt + c; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + COPY ht_0,%ret1 ; copy carry + STD lt_0,0(r_ptr) ; rp[0] = lt + +bn_mul_words_exit + .EXIT + EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 + LDD -96(%sp),%r7 ; restore r7 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 ; restore r3 + .PROCEND + +;---------------------------------------------------------------------------- +; +;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) +; +; arg0 = rp +; arg1 = ap +; arg2 = num +; + +bn_sqr_words + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + NOP + STD %r5,16(%sp) ; save r5 + + CMPIB,>= 0,num,bn_sqr_words_exit + LDO 128(%sp),%sp ; bump stack + + ; + ; If only 1, the goto straight to cleanup + ; + CMPIB,= 1,num,bn_sqr_words_single_top + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + +bn_sqr_words_unroll2 + FLDD 0(a_ptr),t_float_0 ; a[0] + FLDD 8(a_ptr),t_float_1 ; a[1] + XMPYU fht_0,flt_0,fm ; m[0] + XMPYU fht_1,flt_1,fm_1 ; m[1] + + FSTD fm,-24(%sp) ; store m[0] + FSTD fm_1,-56(%sp) ; store m[1] + XMPYU flt_0,flt_0,lt_temp ; lt[0] + XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] + + FSTD lt_temp,-16(%sp) ; store lt[0] + FSTD lt_temp_1,-48(%sp) ; store lt[1] + XMPYU fht_0,fht_0,ht_temp ; ht[0] + XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] + + FSTD ht_temp,-8(%sp) ; store ht[0] + FSTD ht_temp_1,-40(%sp) ; store ht[1] + LDD -24(%sp),m_0 + LDD -56(%sp),m_1 + + AND m_0,high_mask,tmp_0 ; m[0] & Mask + AND m_1,high_mask,tmp_1 ; m[1] & Mask + DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 + DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 + + LDD -16(%sp),lt_0 + LDD -48(%sp),lt_1 + EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 + EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 + + LDD -8(%sp),ht_0 + LDD -40(%sp),ht_1 + ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 + ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 + + ADD lt_0,m_0,lt_0 ; lt = lt+m + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + STD lt_0,0(r_ptr) ; rp[0] = lt[0] + STD ht_0,8(r_ptr) ; rp[1] = ht[1] + + ADD lt_1,m_1,lt_1 ; lt = lt+m + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + STD lt_1,16(r_ptr) ; rp[2] = lt[1] + STD ht_1,24(r_ptr) ; rp[3] = ht[1] + + LDO -2(num),num ; num = num - 2; + LDO 16(a_ptr),a_ptr ; ap += 2 + CMPIB,<= 2,num,bn_sqr_words_unroll2 + LDO 32(r_ptr),r_ptr ; rp += 4 + + CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? 
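[Editorial reference, not part of the patch: the squaring loop above writes the full 128-bit square of each limb, low word first, doubling the single cross product. A minimal C sketch under the same 64-bit-limb assumption; the function name is illustrative.]

#include <stdint.h>

/* rp[2*i], rp[2*i+1] = low/high 64 bits of ap[i]^2 */
void sqr_words_ref(uint64_t *rp, const uint64_t *ap, int num)
{
    for (int i = 0; i < num; i++) {
        uint64_t a_h = ap[i] >> 32, a_l = ap[i] & 0xffffffffUL;

        uint64_t lt = a_l * a_l;        /* lt_temp                         */
        uint64_t ht = a_h * a_h;        /* ht_temp                         */
        uint64_t m  = a_h * a_l;        /* m; the cross term counts twice  */

        uint64_t lo = lt + (m << 33);   /* 2*m*2^32, low half              */
        ht += (m >> 31) + (lo < lt);    /* 2*m*2^32, high half plus carry  */

        rp[2 * i]     = lo;             /* low word of the square          */
        rp[2 * i + 1] = ht;             /* high word of the square         */
    }
}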
+ + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_sqr_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + + XMPYU fht_0,flt_0,fm ; m + FSTD fm,-24(%sp) ; store m + + XMPYU flt_0,flt_0,lt_temp ; lt + FSTD lt_temp,-16(%sp) ; store lt + + XMPYU fht_0,fht_0,ht_temp ; ht + FSTD ht_temp,-8(%sp) ; store ht + + LDD -24(%sp),m_0 ; load m + AND m_0,high_mask,tmp_0 ; m & Mask + DEPD,Z m_0,30,31,m_0 ; m << 32+1 + LDD -16(%sp),lt_0 ; lt + + LDD -8(%sp),ht_0 ; ht + EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 + ADD m_0,lt_0,lt_0 ; lt = lt+m + ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 + ADD,DC ht_0,%r0,ht_0 ; ht++ + + STD lt_0,0(r_ptr) ; rp[0] = lt + STD ht_0,8(r_ptr) ; rp[1] = ht + +bn_sqr_words_exit + .EXIT + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + .PROCEND ;in=23,24,25,26,29;out=28; + + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) +; +; arg0 = rp +; arg1 = ap +; arg2 = bp +; arg3 = n + +t .reg %r22 +b .reg %r21 +l .reg %r20 + +bn_add_words + .proc + .entry + .callinfo + .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .align 64 + + CMPIB,>= 0,n,bn_add_words_exit + COPY %r0,%ret1 ; return 0 by default + + ; + ; If 2 or more numbers do the loop + ; + CMPIB,= 1,n,bn_add_words_single_top + NOP + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; +bn_add_words_unroll2 + LDD 0(a_ptr),t + LDD 0(b_ptr),b + ADD t,%ret1,t ; t = t+c; + ADD,DC %r0,%r0,%ret1 ; set c to carry + ADD t,b,l ; l = t + b[0] + ADD,DC %ret1,%r0,%ret1 ; c+= carry + STD l,0(r_ptr) + + LDD 8(a_ptr),t + LDD 8(b_ptr),b + ADD t,%ret1,t ; t = t+c; + ADD,DC %r0,%r0,%ret1 ; set c to carry + ADD t,b,l ; l = t + b[0] + ADD,DC %ret1,%r0,%ret1 ; c+= carry + STD l,8(r_ptr) + + LDO -2(n),n + LDO 16(a_ptr),a_ptr + LDO 16(b_ptr),b_ptr + + CMPIB,<= 2,n,bn_add_words_unroll2 + LDO 16(r_ptr),r_ptr + + CMPIB,=,N 0,n,bn_add_words_exit ; are we done? + +bn_add_words_single_top + LDD 0(a_ptr),t + LDD 0(b_ptr),b + + ADD t,%ret1,t ; t = t+c; + ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??) 
+ ADD t,b,l ; l = t + b[0] + ADD,DC %ret1,%r0,%ret1 ; c+= carry + STD l,0(r_ptr) + +bn_add_words_exit + .EXIT + BVE (%rp) + EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 + .PROCEND ;in=23,24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) +; +; arg0 = rp +; arg1 = ap +; arg2 = bp +; arg3 = n + +t1 .reg %r22 +t2 .reg %r21 +sub_tmp1 .reg %r20 +sub_tmp2 .reg %r19 + + +bn_sub_words + .proc + .callinfo + .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + CMPIB,>= 0,n,bn_sub_words_exit + COPY %r0,%ret1 ; return 0 by default + + ; + ; If 2 or more numbers do the loop + ; + CMPIB,= 1,n,bn_sub_words_single_top + NOP + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; +bn_sub_words_unroll2 + LDD 0(a_ptr),t1 + LDD 0(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; + + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret1 + STD sub_tmp1,0(r_ptr) + + LDD 8(a_ptr),t1 + LDD 8(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret1 + STD sub_tmp1,8(r_ptr) + + LDO -2(n),n + LDO 16(a_ptr),a_ptr + LDO 16(b_ptr),b_ptr + + CMPIB,<= 2,n,bn_sub_words_unroll2 + LDO 16(r_ptr),r_ptr + + CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? + +bn_sub_words_single_top + LDD 0(a_ptr),t1 + LDD 0(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret1 + + STD sub_tmp1,0(r_ptr) + +bn_sub_words_exit + .EXIT + BVE (%rp) + EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 + .PROCEND ;in=23,24,25,26,29;out=28; + +;------------------------------------------------------------------------------ +; +; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) +; +; arg0 = h +; arg1 = l +; arg2 = d +; +; This is mainly just output from the HP C compiler. 
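[Editorial reference, not part of the patch: bn_add_words and bn_sub_words above thread a single limb of carry/borrow between iterations. In the subtract loop, the CMPCLR,*>> / CMPCLR,*= pair leaves the incoming borrow unchanged when a[i] == b[i] and otherwise sets it to (a[i] < b[i]). A C sketch of that behaviour, with 64-bit limbs and illustrative names assumed.]

#include <stdint.h>

uint64_t add_words_ref(uint64_t *r, const uint64_t *a, const uint64_t *b, int n)
{
    uint64_t c = 0;
    for (int i = 0; i < n; i++) {
        uint64_t t  = a[i] + c;         /* t = t + c                       */
        uint64_t c1 = (t < c);          /* ADD,DC: pick up the carry       */
        uint64_t l  = t + b[i];         /* l = t + b[i]                    */
        c = c1 + (l < t);               /* c += carry                      */
        r[i] = l;
    }
    return c;
}

uint64_t sub_words_ref(uint64_t *r, const uint64_t *a, const uint64_t *b, int n)
{
    uint64_t c = 0;                     /* borrow */
    for (int i = 0; i < n; i++) {
        uint64_t ai = a[i], bi = b[i];  /* read first in case r aliases a or b */
        r[i] = ai - bi - c;
        /* same rule as the CMPCLR sequence: keep c when the words are equal */
        c = (ai < bi) || (ai == bi && c);
    }
    return c;
}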
+; +;------------------------------------------------------------------------------ +bn_div_words + .PROC + .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN + .IMPORT BN_num_bits_word,CODE + ;--- not PIC .IMPORT __iob,DATA + ;--- not PIC .IMPORT fprintf,CODE + .IMPORT abort,CODE + .IMPORT $$div2U,MILLICODE + .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE + .ENTRY + STW %r2,-20(%r30) ;offset 0x8ec + STW,MA %r3,192(%r30) ;offset 0x8f0 + STW %r4,-188(%r30) ;offset 0x8f4 + DEPD %r5,31,32,%r6 ;offset 0x8f8 + STD %r6,-184(%r30) ;offset 0x8fc + DEPD %r7,31,32,%r8 ;offset 0x900 + STD %r8,-176(%r30) ;offset 0x904 + STW %r9,-168(%r30) ;offset 0x908 + LDD -248(%r30),%r3 ;offset 0x90c + COPY %r26,%r4 ;offset 0x910 + COPY %r24,%r5 ;offset 0x914 + DEPD %r25,31,32,%r4 ;offset 0x918 + CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c + DEPD %r23,31,32,%r5 ;offset 0x920 + MOVIB,TR -1,%r29,$00060002 ;offset 0x924 + EXTRD,U %r29,31,32,%r28 ;offset 0x928 +$0006002A + LDO -1(%r29),%r29 ;offset 0x92c + SUB %r23,%r7,%r23 ;offset 0x930 +$00060024 + SUB %r4,%r31,%r25 ;offset 0x934 + AND %r25,%r19,%r26 ;offset 0x938 + CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c + DEPD,Z %r25,31,32,%r20 ;offset 0x940 + OR %r20,%r24,%r21 ;offset 0x944 + CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 + SUB %r31,%r2,%r31 ;offset 0x94c +$00060046 +$0006002E + DEPD,Z %r23,31,32,%r25 ;offset 0x950 + EXTRD,U %r23,31,32,%r26 ;offset 0x954 + AND %r25,%r19,%r24 ;offset 0x958 + ADD,L %r31,%r26,%r31 ;offset 0x95c + CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 + LDO 1(%r31),%r31 ;offset 0x964 +$00060032 + CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 + LDO -1(%r29),%r29 ;offset 0x96c + ADD,L %r4,%r3,%r4 ;offset 0x970 +$00060036 + ADDIB,=,N -1,%r8,$D0 ;offset 0x974 + SUB %r5,%r24,%r28 ;offset 0x978 +$0006003A + SUB %r4,%r31,%r24 ;offset 0x97c + SHRPD %r24,%r28,32,%r4 ;offset 0x980 + DEPD,Z %r29,31,32,%r9 ;offset 0x984 + DEPD,Z %r28,31,32,%r5 ;offset 0x988 +$0006001C + EXTRD,U %r4,31,32,%r31 ;offset 0x98c + CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 + MOVB,TR %r6,%r29,$D1 ;offset 0x994 + STD %r29,-152(%r30) ;offset 0x998 +$0006000C + EXTRD,U %r3,31,32,%r25 ;offset 0x99c + COPY %r3,%r26 ;offset 0x9a0 + EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 + EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 + .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; + B,L BN_num_bits_word,%r2 ;offset 0x9ac + EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 + LDI 64,%r20 ;offset 0x9b4 + DEPD %r7,31,32,%r5 ;offset 0x9b8 + DEPD %r8,31,32,%r4 ;offset 0x9bc + DEPD %r9,31,32,%r3 ;offset 0x9c0 + CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 + COPY %r28,%r24 ;offset 0x9c8 + MTSARCM %r24 ;offset 0x9cc + DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 + CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 +$00060012 + SUBI 64,%r24,%r31 ;offset 0x9d8 + CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc + SUB %r4,%r3,%r4 ;offset 0x9e0 +$00060016 + CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 + COPY %r0,%r9 ;offset 0x9e8 + MTSARCM %r31 ;offset 0x9ec + DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 + SUBI 64,%r31,%r26 ;offset 0x9f4 + MTSAR %r26 ;offset 0x9f8 + SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc + MTSARCM %r31 ;offset 0xa00 + DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 +$0006001A + DEPDI,Z -1,31,32,%r19 ;offset 0xa08 + AND %r3,%r19,%r29 ;offset 0xa0c + EXTRD,U %r29,31,32,%r2 ;offset 0xa10 + DEPDI,Z -1,63,32,%r6 ;offset 0xa14 + MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 + EXTRD,U %r3,63,32,%r7 ;offset 0xa1c +$D2 + ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 + ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 + ;--- 
not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 + ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; + ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c + ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 + .CALL ; + B,L abort,%r2 ;offset 0xa34 + NOP ;offset 0xa38 + B $D3 ;offset 0xa3c + LDW -212(%r30),%r2 ;offset 0xa40 +$00060020 + COPY %r4,%r26 ;offset 0xa44 + EXTRD,U %r4,31,32,%r25 ;offset 0xa48 + COPY %r2,%r24 ;offset 0xa4c + .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) + B,L $$div2U,%r31 ;offset 0xa50 + EXTRD,U %r2,31,32,%r23 ;offset 0xa54 + DEPD %r28,31,32,%r29 ;offset 0xa58 +$00060022 + STD %r29,-152(%r30) ;offset 0xa5c +$D1 + AND %r5,%r19,%r24 ;offset 0xa60 + EXTRD,U %r24,31,32,%r24 ;offset 0xa64 + STW %r2,-160(%r30) ;offset 0xa68 + STW %r7,-128(%r30) ;offset 0xa6c + FLDD -152(%r30),%fr4 ;offset 0xa70 + FLDD -152(%r30),%fr7 ;offset 0xa74 + FLDW -160(%r30),%fr8L ;offset 0xa78 + FLDW -128(%r30),%fr5L ;offset 0xa7c + XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 + FSTD %fr10,-136(%r30) ;offset 0xa84 + XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 + FSTD %fr22,-144(%r30) ;offset 0xa8c + XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 + XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 + FSTD %fr11,-112(%r30) ;offset 0xa98 + FSTD %fr23,-120(%r30) ;offset 0xa9c + LDD -136(%r30),%r28 ;offset 0xaa0 + DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 + LDD -144(%r30),%r20 ;offset 0xaa8 + ADD,L %r20,%r31,%r31 ;offset 0xaac + LDD -112(%r30),%r22 ;offset 0xab0 + DEPD,Z %r22,31,32,%r22 ;offset 0xab4 + LDD -120(%r30),%r21 ;offset 0xab8 + B $00060024 ;offset 0xabc + ADD,L %r21,%r22,%r23 ;offset 0xac0 +$D0 + OR %r9,%r29,%r29 ;offset 0xac4 +$00060040 + EXTRD,U %r29,31,32,%r28 ;offset 0xac8 +$00060002 +$L2 + LDW -212(%r30),%r2 ;offset 0xacc +$D3 + LDW -168(%r30),%r9 ;offset 0xad0 + LDD -176(%r30),%r8 ;offset 0xad4 + EXTRD,U %r8,31,32,%r7 ;offset 0xad8 + LDD -184(%r30),%r6 ;offset 0xadc + EXTRD,U %r6,31,32,%r5 ;offset 0xae0 + LDW -188(%r30),%r4 ;offset 0xae4 + BVE (%r2) ;offset 0xae8 + .EXIT + LDW,MB -192(%r30),%r3 ;offset 0xaec + .PROCEND ;in=23,25;out=28,29;fpin=105,107; + + + + +;---------------------------------------------------------------------------- +; +; Registers to hold 64-bit values to manipulate. The "L" part +; of the register corresponds to the upper 32-bits, while the "R" +; part corresponds to the lower 32-bits +; +; Note, that when using b6 and b7, the code must save these before +; using them because they are callee save registers +; +; +; Floating point registers to use to save values that +; are manipulated. These don't collide with ftemp1-6 and +; are all caller save registers +; +a0 .reg %fr22 +a0L .reg %fr22L +a0R .reg %fr22R + +a1 .reg %fr23 +a1L .reg %fr23L +a1R .reg %fr23R + +a2 .reg %fr24 +a2L .reg %fr24L +a2R .reg %fr24R + +a3 .reg %fr25 +a3L .reg %fr25L +a3R .reg %fr25R + +a4 .reg %fr26 +a4L .reg %fr26L +a4R .reg %fr26R + +a5 .reg %fr27 +a5L .reg %fr27L +a5R .reg %fr27R + +a6 .reg %fr28 +a6L .reg %fr28L +a6R .reg %fr28R + +a7 .reg %fr29 +a7L .reg %fr29L +a7R .reg %fr29R + +b0 .reg %fr30 +b0L .reg %fr30L +b0R .reg %fr30R + +b1 .reg %fr31 +b1L .reg %fr31L +b1R .reg %fr31R + +; +; Temporary floating point variables, these are all caller save +; registers +; +ftemp1 .reg %fr4 +ftemp2 .reg %fr5 +ftemp3 .reg %fr6 +ftemp4 .reg %fr7 + +; +; The B set of registers when used. 
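[Editorial reference, not part of the patch: the SQR_ADD_C, SQR_ADD_C2 and MUL_ADD_C macros defined after these register assignments maintain a running three-limb accumulator c3:c2:c1 for the comba routines. Their net effect, written in C with a compiler-provided unsigned __int128 standing in for the 32x32 partial-product sequence; names and the __int128 extension are assumptions.]

#include <stdint.h>

typedef unsigned __int128 u128;         /* assumes a 128-bit integer extension */

/* (c3:c2:c1) += a*b -- the effect of MUL_ADD_C (and of SQR_ADD_C with a == b) */
static void mul_add_c_ref(uint64_t a, uint64_t b,
                          uint64_t *c1, uint64_t *c2, uint64_t *c3)
{
    u128 t = (u128)a * b;
    uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);

    *c1 += lo;  hi  += (*c1 < lo);      /* carry into the middle limb */
    *c2 += hi;  *c3 += (*c2 < hi);      /* carry into the top limb    */
}

/* (c3:c2:c1) += 2*a*b -- the effect of SQR_ADD_C2 */
static void sqr_add_c2_ref(uint64_t a, uint64_t b,
                           uint64_t *c1, uint64_t *c2, uint64_t *c3)
{
    mul_add_c_ref(a, b, c1, c2, c3);    /* adding a*b twice has the same */
    mul_add_c_ref(a, b, c1, c2, c3);    /* accumulator effect as doubling */
}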
+; + +b2 .reg %fr8 +b2L .reg %fr8L +b2R .reg %fr8R + +b3 .reg %fr9 +b3L .reg %fr9L +b3R .reg %fr9R + +b4 .reg %fr10 +b4L .reg %fr10L +b4R .reg %fr10R + +b5 .reg %fr11 +b5L .reg %fr11L +b5R .reg %fr11R + +b6 .reg %fr12 +b6L .reg %fr12L +b6R .reg %fr12R + +b7 .reg %fr13 +b7L .reg %fr13L +b7R .reg %fr13R + +c1 .reg %r21 ; only reg +temp1 .reg %r20 ; only reg +temp2 .reg %r19 ; only reg +temp3 .reg %r31 ; only reg + +m1 .reg %r28 +c2 .reg %r23 +high_one .reg %r1 +ht .reg %r6 +lt .reg %r5 +m .reg %r4 +c3 .reg %r3 + +SQR_ADD_C .macro A0L,A0R,C1,C2,C3 + XMPYU A0L,A0R,ftemp1 ; m + FSTD ftemp1,-24(%sp) ; store m + + XMPYU A0R,A0R,ftemp2 ; lt + FSTD ftemp2,-16(%sp) ; store lt + + XMPYU A0L,A0L,ftemp3 ; ht + FSTD ftemp3,-8(%sp) ; store ht + + LDD -24(%sp),m ; load m + AND m,high_mask,temp2 ; m & Mask + DEPD,Z m,30,31,temp3 ; m << 32+1 + LDD -16(%sp),lt ; lt + + LDD -8(%sp),ht ; ht + EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 + ADD temp3,lt,lt ; lt = lt+m + ADD,L ht,temp1,ht ; ht += temp1 + ADD,DC ht,%r0,ht ; ht++ + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC ht,%r0,ht ; ht++ + + ADD C2,ht,C2 ; c2=c2+ht + ADD,DC C3,%r0,C3 ; c3++ +.endm + +SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 + XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht + FSTD ftemp1,-16(%sp) ; + XMPYU A0R,A1L,ftemp2 ; m = bh*lt + FSTD ftemp2,-8(%sp) ; + XMPYU A0R,A1R,ftemp3 ; lt = bl*lt + FSTD ftemp3,-32(%sp) + XMPYU A0L,A1L,ftemp4 ; ht = bh*ht + FSTD ftemp4,-24(%sp) ; + + LDD -8(%sp),m ; r21 = m + LDD -16(%sp),m1 ; r19 = m1 + ADD,L m,m1,m ; m+m1 + + DEPD,Z m,31,32,temp3 ; (m+m1<<32) + LDD -24(%sp),ht ; r24 = ht + + CMPCLR,*>>= m,m1,%r0 ; if (m < m1) + ADD,L ht,high_one,ht ; ht+=high_one + + EXTRD,U m,31,32,temp1 ; m >> 32 + LDD -32(%sp),lt ; lt + ADD,L ht,temp1,ht ; ht+= m>>32 + ADD lt,temp3,lt ; lt = lt+m1 + ADD,DC ht,%r0,ht ; ht++ + + ADD ht,ht,ht ; ht=ht+ht; + ADD,DC C3,%r0,C3 ; add in carry (c3++) + + ADD lt,lt,lt ; lt=lt+lt; + ADD,DC ht,%r0,ht ; add in carry (ht++) + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) + LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise + + ADD C2,ht,C2 ; c2 = c2 + ht + ADD,DC C3,%r0,C3 ; add in carry (c3++) +.endm + +; +;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) +; arg0 = r_ptr +; arg1 = a_ptr +; + +bn_sqr_comba8 + .PROC + .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .ENTRY + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + SQR_ADD_C a0L,a0R,c1,c2,c3 + STD c1,0(r_ptr) ; r[0] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 + STD c2,8(r_ptr) ; r[1] = c2; + COPY %r0,c2 + + SQR_ADD_C a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 + STD c3,16(r_ptr) ; r[2] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 + SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 + STD c1,24(r_ptr) ; r[3] = c1; + COPY %r0,c1 + + SQR_ADD_C a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 + SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 + STD c2,32(r_ptr) ; r[4] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 + 
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 + STD c3,40(r_ptr) ; r[5] = c3; + COPY %r0,c3 + + SQR_ADD_C a3L,a3R,c1,c2,c3 + SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 + SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 + SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 + STD c1,48(r_ptr) ; r[6] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 + SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 + SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 + STD c2,56(r_ptr) ; r[7] = c2; + COPY %r0,c2 + + SQR_ADD_C a4L,a4R,c3,c1,c2 + SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 + SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 + SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 + STD c3,64(r_ptr) ; r[8] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 + SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 + SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 + STD c1,72(r_ptr) ; r[9] = c1; + COPY %r0,c1 + + SQR_ADD_C a5L,a5R,c2,c3,c1 + SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 + SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 + STD c2,80(r_ptr) ; r[10] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 + SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 + STD c3,88(r_ptr) ; r[11] = c3; + COPY %r0,c3 + + SQR_ADD_C a6L,a6R,c1,c2,c3 + SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 + STD c1,96(r_ptr) ; r[12] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 + STD c2,104(r_ptr) ; r[13] = c2; + COPY %r0,c2 + + SQR_ADD_C a7L,a7R,c3,c1,c2 + STD c3, 112(r_ptr) ; r[14] = c3 + STD c1, 120(r_ptr) ; r[15] = c1 + + .EXIT + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + +;----------------------------------------------------------------------------- +; +;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) +; arg0 = r_ptr +; arg1 = a_ptr +; + +bn_sqr_comba4 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + SQR_ADD_C a0L,a0R,c1,c2,c3 + + STD c1,0(r_ptr) ; r[0] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 + + STD c2,8(r_ptr) ; r[1] = c2; + COPY %r0,c2 + + SQR_ADD_C a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 + + STD c3,16(r_ptr) ; r[2] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 + SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 + + STD c1,24(r_ptr) ; r[3] = c1; + COPY %r0,c1 + + SQR_ADD_C a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 + + STD c2,32(r_ptr) ; r[4] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 + STD c3,40(r_ptr) ; r[5] = c3; + COPY %r0,c3 + + SQR_ADD_C a3L,a3R,c1,c2,c3 + STD c1,48(r_ptr) ; r[6] = c1; + STD c2,56(r_ptr) ; r[7] = c2; + + .EXIT + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + + +;--------------------------------------------------------------------------- + +MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 + XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht + FSTD ftemp1,-16(%sp) ; + XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt + FSTD ftemp2,-8(%sp) ; + XMPYU A0R,B0R,ftemp3 ; lt = bl*lt + FSTD ftemp3,-32(%sp) + XMPYU A0L,B0L,ftemp4 ; ht = bh*ht + FSTD ftemp4,-24(%sp) ; + + LDD -8(%sp),m ; r21 = m + LDD -16(%sp),m1 ; r19 = m1 + ADD,L m,m1,m ; m+m1 + + DEPD,Z m,31,32,temp3 ; (m+m1<<32) + LDD -24(%sp),ht ; r24 = ht + + CMPCLR,*>>= m,m1,%r0 ; if (m < m1) + ADD,L ht,high_one,ht ; ht+=high_one + + EXTRD,U m,31,32,temp1 ; m >> 32 + LDD -32(%sp),lt ; lt + ADD,L ht,temp1,ht ; ht+= m>>32 + ADD lt,temp3,lt ; lt = lt+m1 + ADD,DC ht,%r0,ht ; ht++ + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise + + ADD C2,ht,C2 ; c2 = c2 + ht + ADD,DC C3,%r0,C3 ; add in carry (c3++) +.endm + + +; +;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) +; arg0 = r_ptr +; arg1 = a_ptr +; arg2 = b_ptr +; + +bn_mul_comba8 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + FSTD %fr12,32(%sp) ; save r6 + FSTD %fr13,40(%sp) ; save r7 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + FLDD 0(b_ptr),b0 + FLDD 8(b_ptr),b1 + FLDD 16(b_ptr),b2 + FLDD 24(b_ptr),b3 + FLDD 32(b_ptr),b4 + FLDD 40(b_ptr),b5 + FLDD 48(b_ptr),b6 + FLDD 56(b_ptr),b7 + + MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 + STD c1,0(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 + STD c2,8(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 + STD c3,16(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 + STD c1,24(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 + STD c2,32(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 + MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 + MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 + STD c3,40(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 + STD c1,48(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 + MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 + STD c2,56(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 + 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 + STD c3,64(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 + MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 + MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 + STD c1,72(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 + MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 + MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 + STD c2,80(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 + MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 + STD c3,88(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 + STD c1,96(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 + MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 + STD c2,104(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 + STD c3,112(r_ptr) + STD c1,120(r_ptr) + + .EXIT + FLDD -88(%sp),%fr13 + FLDD -96(%sp),%fr12 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + +;----------------------------------------------------------------------------- +; +;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) +; arg0 = r_ptr +; arg1 = a_ptr +; arg2 = b_ptr +; + +bn_mul_comba4 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + FSTD %fr12,32(%sp) ; save r6 + FSTD %fr13,40(%sp) ; save r7 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + + FLDD 0(b_ptr),b0 + FLDD 8(b_ptr),b1 + FLDD 16(b_ptr),b2 + FLDD 24(b_ptr),b3 + + MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 + STD c1,0(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 + STD c2,8(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 + STD c3,16(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 + STD c1,24(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 + STD c2,32(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 + STD c3,40(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 + STD c1,48(r_ptr) + STD c2,56(r_ptr) + + .EXIT + FLDD -88(%sp),%fr13 + FLDD -96(%sp),%fr12 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + + +;--- not PIC .SPACE $TEXT$ +;--- not PIC .SUBSPA $CODE$ +;--- not PIC .SPACE $PRIVATE$,SORT=16 +;--- not PIC .IMPORT $global$,DATA +;--- not PIC .SPACE $TEXT$ +;--- not PIC .SUBSPA $CODE$ +;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c +;--- not PIC C$7 +;--- not PIC .ALIGN 8 +;--- not PIC .STRINGZ "Division would overflow (%d)\n" + 
.END diff --git a/main/openssl/crypto/bn/asm/pa-risc2.s b/main/openssl/crypto/bn/asm/pa-risc2.s deleted file mode 100644 index f3b16290..00000000 --- a/main/openssl/crypto/bn/asm/pa-risc2.s +++ /dev/null @@ -1,1618 +0,0 @@ -; -; PA-RISC 2.0 implementation of bn_asm code, based on the -; 64-bit version of the code. This code is effectively the -; same as the 64-bit version except the register model is -; slightly different given all values must be 32-bit between -; function calls. Thus the 64-bit return values are returned -; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit -; -; -; This code is approximately 2x faster than the C version -; for RSA/DSA. -; -; See http://devresource.hp.com/ for more details on the PA-RISC -; architecture. Also see the book "PA-RISC 2.0 Architecture" -; by Gerry Kane for information on the instruction set architecture. -; -; Code written by Chris Ruemmler (with some help from the HP C -; compiler). -; -; The code compiles with HP's assembler -; - - .level 2.0N - .space $TEXT$ - .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY - -; -; Global Register definitions used for the routines. -; -; Some information about HP's runtime architecture for 32-bits. -; -; "Caller save" means the calling function must save the register -; if it wants the register to be preserved. -; "Callee save" means if a function uses the register, it must save -; the value before using it. -; -; For the floating point registers -; -; "caller save" registers: fr4-fr11, fr22-fr31 -; "callee save" registers: fr12-fr21 -; "special" registers: fr0-fr3 (status and exception registers) -; -; For the integer registers -; value zero : r0 -; "caller save" registers: r1,r19-r26 -; "callee save" registers: r3-r18 -; return register : r2 (rp) -; return values ; r28,r29 (ret0,ret1) -; Stack pointer ; r30 (sp) -; millicode return ptr ; r31 (also a caller save register) - - -; -; Arguments to the routines -; -r_ptr .reg %r26 -a_ptr .reg %r25 -b_ptr .reg %r24 -num .reg %r24 -n .reg %r23 - -; -; Note that the "w" argument for bn_mul_add_words and bn_mul_words -; is passed on the stack at a delta of -56 from the top of stack -; as the routine is entered. 
-; - -; -; Globals used in some routines -; - -top_overflow .reg %r23 -high_mask .reg %r22 ; value 0xffffffff80000000L - - -;------------------------------------------------------------------------------ -; -; bn_mul_add_words -; -;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, -; int num, BN_ULONG w) -; -; arg0 = r_ptr -; arg1 = a_ptr -; arg3 = num -; -56(sp) = w -; -; Local register definitions -; - -fm1 .reg %fr22 -fm .reg %fr23 -ht_temp .reg %fr24 -ht_temp_1 .reg %fr25 -lt_temp .reg %fr26 -lt_temp_1 .reg %fr27 -fm1_1 .reg %fr28 -fm_1 .reg %fr29 - -fw_h .reg %fr7L -fw_l .reg %fr7R -fw .reg %fr7 - -fht_0 .reg %fr8L -flt_0 .reg %fr8R -t_float_0 .reg %fr8 - -fht_1 .reg %fr9L -flt_1 .reg %fr9R -t_float_1 .reg %fr9 - -tmp_0 .reg %r31 -tmp_1 .reg %r21 -m_0 .reg %r20 -m_1 .reg %r19 -ht_0 .reg %r1 -ht_1 .reg %r3 -lt_0 .reg %r4 -lt_1 .reg %r5 -m1_0 .reg %r6 -m1_1 .reg %r7 -rp_val .reg %r8 -rp_val_1 .reg %r9 - -bn_mul_add_words - .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN - .proc - .callinfo frame=128 - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP ; Needed to make the loop 16-byte aligned - NOP ; needed to make the loop 16-byte aligned - - STD %r5,16(%sp) ; save r5 - NOP - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - - STD %r8,40(%sp) ; save r8 - STD %r9,48(%sp) ; save r9 - COPY %r0,%ret1 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - - CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit - LDO 128(%sp),%sp ; bump stack - - ; - ; The loop is unrolled twice, so if there is only 1 number - ; then go straight to the cleanup code. - ; - CMPIB,= 1,num,bn_mul_add_words_single_top - FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit mutiplies can be issued per cycle. 
- ; -bn_mul_add_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDD 8(r_ptr),rp_val_1 ; rp[1] - - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1[0] - FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] - - XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h - XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m[0] - FSTD fm_1,-40(%sp) ; -40(sp) = m[1] - - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 - - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 - - LDD -8(%sp),m_0 ; m[0] - LDD -40(%sp),m_1 ; m[1] - LDD -16(%sp),m1_0 ; m1[0] - LDD -48(%sp),m1_1 ; m1[1] - - LDD -24(%sp),ht_0 ; ht[0] - LDD -56(%sp),ht_1 ; ht[1] - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; - - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) - ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) - - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) - ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) - EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 - - EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 - ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) - ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) - - ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - - ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - - LDO -2(num),num ; num = num - 2; - ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - - ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] - ADD,DC ht_1,%r0,%ret1 ; ht[1]++ - LDO 16(a_ptr),a_ptr ; a_ptr += 2 - - STD lt_1,8(r_ptr) ; rp[1] = lt[1] - CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do - LDO 16(r_ptr),r_ptr ; r_ptr += 2 - - CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_add_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDO 8(a_ptr),a_ptr ; a_ptr++ - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 ; m1 = temp1 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD %ret1,tmp_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] - ADD,DC 
ht_0,%r0,%ret1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_add_words_exit - .EXIT - - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - LDD -80(%sp),%r9 ; restore r9 - LDD -88(%sp),%r8 ; restore r8 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) -; -; arg0 = rp -; arg1 = ap -; arg3 = num -; w on stack at -56(sp) - -bn_mul_words - .proc - .callinfo frame=128 - .entry - .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - COPY %r0,%ret1 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - - CMPIB,>= 0,num,bn_mul_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; See if only 1 word to do, thus just do cleanup - ; - CMPIB,= 1,num,bn_mul_words_single_top - FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit mutiplies can be issued per cycle. - ; -bn_mul_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l - - FSTD fm1,-16(%sp) ; -16(sp) = m1 - FSTD fm1_1,-48(%sp) ; -48(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h - - FSTD fm,-8(%sp) ; -8(sp) = m - FSTD fm_1,-40(%sp) ; -40(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h - - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt - LDD -8(%sp),m_0 - LDD -40(%sp),m_1 - - LDD -16(%sp),m1_0 - LDD -48(%sp),m1_1 - LDD -24(%sp),ht_0 - LDD -56(%sp),ht_1 - - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) - ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - EXTRD,U tmp_1,31,32,m_1 ; m>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD lt_1,m1_1,lt_1 ; lt = lt+m1; - ADD,DC ht_1,%r0,ht_1 ; ht++ - ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1); - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) - ADD,DC ht_1,%r0,ht_1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - STD lt_1,8(r_ptr) ; rp[1] = lt - - COPY ht_1,%ret1 ; carry = ht - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_mul_words_unroll2 - LDO 16(r_ptr),r_ptr ; rp++ - - CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? 
- - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt= lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD %ret1,lt_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - COPY ht_0,%ret1 ; copy carry - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_words_exit - .EXIT - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND - -;---------------------------------------------------------------------------- -; -;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; - -bn_sqr_words - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - CMPIB,>= 0,num,bn_sqr_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; If only 1, the goto straight to cleanup - ; - CMPIB,= 1,num,bn_sqr_words_single_top - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - -bn_sqr_words_unroll2 - FLDD 0(a_ptr),t_float_0 ; a[0] - FLDD 8(a_ptr),t_float_1 ; a[1] - XMPYU fht_0,flt_0,fm ; m[0] - XMPYU fht_1,flt_1,fm_1 ; m[1] - - FSTD fm,-24(%sp) ; store m[0] - FSTD fm_1,-56(%sp) ; store m[1] - XMPYU flt_0,flt_0,lt_temp ; lt[0] - XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] - - FSTD lt_temp,-16(%sp) ; store lt[0] - FSTD lt_temp_1,-48(%sp) ; store lt[1] - XMPYU fht_0,fht_0,ht_temp ; ht[0] - XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] - - FSTD ht_temp,-8(%sp) ; store ht[0] - FSTD ht_temp_1,-40(%sp) ; store ht[1] - LDD -24(%sp),m_0 - LDD -56(%sp),m_1 - - AND m_0,high_mask,tmp_0 ; m[0] & Mask - AND m_1,high_mask,tmp_1 ; m[1] & Mask - DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 - DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 - - LDD -16(%sp),lt_0 - LDD -48(%sp),lt_1 - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 - EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 - - LDD -8(%sp),ht_0 - LDD -40(%sp),ht_1 - ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 - ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 - - ADD lt_0,m_0,lt_0 ; lt = lt+m - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - STD ht_0,8(r_ptr) ; rp[1] = ht[1] - - ADD lt_1,m_1,lt_1 ; lt = lt+m - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_1,16(r_ptr) ; rp[2] = lt[1] - STD ht_1,24(r_ptr) ; rp[3] = ht[1] - - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_sqr_words_unroll2 - LDO 32(r_ptr),r_ptr ; rp += 4 - - CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? 
- - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_sqr_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,flt_0,fm ; m - FSTD fm,-24(%sp) ; store m - - XMPYU flt_0,flt_0,lt_temp ; lt - FSTD lt_temp,-16(%sp) ; store lt - - XMPYU fht_0,fht_0,ht_temp ; ht - FSTD ht_temp,-8(%sp) ; store ht - - LDD -24(%sp),m_0 ; load m - AND m_0,high_mask,tmp_0 ; m & Mask - DEPD,Z m_0,30,31,m_0 ; m << 32+1 - LDD -16(%sp),lt_0 ; lt - - LDD -8(%sp),ht_0 ; ht - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 - ADD m_0,lt_0,lt_0 ; lt = lt+m - ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 - ADD,DC ht_0,%r0,ht_0 ; ht++ - - STD lt_0,0(r_ptr) ; rp[0] = lt - STD ht_0,8(r_ptr) ; rp[1] = ht - -bn_sqr_words_exit - .EXIT - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - .PROCEND ;in=23,24,25,26,29;out=28; - - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t .reg %r22 -b .reg %r21 -l .reg %r20 - -bn_add_words - .proc - .entry - .callinfo - .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - CMPIB,>= 0,n,bn_add_words_exit - COPY %r0,%ret1 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_add_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_add_words_unroll2 - LDD 0(a_ptr),t - LDD 0(b_ptr),b - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,0(r_ptr) - - LDD 8(a_ptr),t - LDD 8(b_ptr),b - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_add_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_add_words_exit ; are we done? - -bn_add_words_single_top - LDD 0(a_ptr),t - LDD 0(b_ptr),b - - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??) 
- ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,0(r_ptr) - -bn_add_words_exit - .EXIT - BVE (%rp) - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t1 .reg %r22 -t2 .reg %r21 -sub_tmp1 .reg %r20 -sub_tmp2 .reg %r19 - - -bn_sub_words - .proc - .callinfo - .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - CMPIB,>= 0,n,bn_sub_words_exit - COPY %r0,%ret1 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_sub_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_sub_words_unroll2 - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - STD sub_tmp1,0(r_ptr) - - LDD 8(a_ptr),t1 - LDD 8(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - STD sub_tmp1,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_sub_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? - -bn_sub_words_single_top - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - - STD sub_tmp1,0(r_ptr) - -bn_sub_words_exit - .EXIT - BVE (%rp) - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - .PROCEND ;in=23,24,25,26,29;out=28; - -;------------------------------------------------------------------------------ -; -; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) -; -; arg0 = h -; arg1 = l -; arg2 = d -; -; This is mainly just output from the HP C compiler. 
-; -;------------------------------------------------------------------------------ -bn_div_words - .PROC - .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN - .IMPORT BN_num_bits_word,CODE - ;--- not PIC .IMPORT __iob,DATA - ;--- not PIC .IMPORT fprintf,CODE - .IMPORT abort,CODE - .IMPORT $$div2U,MILLICODE - .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE - .ENTRY - STW %r2,-20(%r30) ;offset 0x8ec - STW,MA %r3,192(%r30) ;offset 0x8f0 - STW %r4,-188(%r30) ;offset 0x8f4 - DEPD %r5,31,32,%r6 ;offset 0x8f8 - STD %r6,-184(%r30) ;offset 0x8fc - DEPD %r7,31,32,%r8 ;offset 0x900 - STD %r8,-176(%r30) ;offset 0x904 - STW %r9,-168(%r30) ;offset 0x908 - LDD -248(%r30),%r3 ;offset 0x90c - COPY %r26,%r4 ;offset 0x910 - COPY %r24,%r5 ;offset 0x914 - DEPD %r25,31,32,%r4 ;offset 0x918 - CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c - DEPD %r23,31,32,%r5 ;offset 0x920 - MOVIB,TR -1,%r29,$00060002 ;offset 0x924 - EXTRD,U %r29,31,32,%r28 ;offset 0x928 -$0006002A - LDO -1(%r29),%r29 ;offset 0x92c - SUB %r23,%r7,%r23 ;offset 0x930 -$00060024 - SUB %r4,%r31,%r25 ;offset 0x934 - AND %r25,%r19,%r26 ;offset 0x938 - CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c - DEPD,Z %r25,31,32,%r20 ;offset 0x940 - OR %r20,%r24,%r21 ;offset 0x944 - CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 - SUB %r31,%r2,%r31 ;offset 0x94c -$00060046 -$0006002E - DEPD,Z %r23,31,32,%r25 ;offset 0x950 - EXTRD,U %r23,31,32,%r26 ;offset 0x954 - AND %r25,%r19,%r24 ;offset 0x958 - ADD,L %r31,%r26,%r31 ;offset 0x95c - CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 - LDO 1(%r31),%r31 ;offset 0x964 -$00060032 - CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 - LDO -1(%r29),%r29 ;offset 0x96c - ADD,L %r4,%r3,%r4 ;offset 0x970 -$00060036 - ADDIB,=,N -1,%r8,$D0 ;offset 0x974 - SUB %r5,%r24,%r28 ;offset 0x978 -$0006003A - SUB %r4,%r31,%r24 ;offset 0x97c - SHRPD %r24,%r28,32,%r4 ;offset 0x980 - DEPD,Z %r29,31,32,%r9 ;offset 0x984 - DEPD,Z %r28,31,32,%r5 ;offset 0x988 -$0006001C - EXTRD,U %r4,31,32,%r31 ;offset 0x98c - CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 - MOVB,TR %r6,%r29,$D1 ;offset 0x994 - STD %r29,-152(%r30) ;offset 0x998 -$0006000C - EXTRD,U %r3,31,32,%r25 ;offset 0x99c - COPY %r3,%r26 ;offset 0x9a0 - EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 - EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 - .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; - B,L BN_num_bits_word,%r2 ;offset 0x9ac - EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 - LDI 64,%r20 ;offset 0x9b4 - DEPD %r7,31,32,%r5 ;offset 0x9b8 - DEPD %r8,31,32,%r4 ;offset 0x9bc - DEPD %r9,31,32,%r3 ;offset 0x9c0 - CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 - COPY %r28,%r24 ;offset 0x9c8 - MTSARCM %r24 ;offset 0x9cc - DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 - CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 -$00060012 - SUBI 64,%r24,%r31 ;offset 0x9d8 - CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc - SUB %r4,%r3,%r4 ;offset 0x9e0 -$00060016 - CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 - COPY %r0,%r9 ;offset 0x9e8 - MTSARCM %r31 ;offset 0x9ec - DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 - SUBI 64,%r31,%r26 ;offset 0x9f4 - MTSAR %r26 ;offset 0x9f8 - SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc - MTSARCM %r31 ;offset 0xa00 - DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 -$0006001A - DEPDI,Z -1,31,32,%r19 ;offset 0xa08 - AND %r3,%r19,%r29 ;offset 0xa0c - EXTRD,U %r29,31,32,%r2 ;offset 0xa10 - DEPDI,Z -1,63,32,%r6 ;offset 0xa14 - MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 - EXTRD,U %r3,63,32,%r7 ;offset 0xa1c -$D2 - ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 - ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 - ;--- 
not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 - ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; - ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c - ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 - .CALL ; - B,L abort,%r2 ;offset 0xa34 - NOP ;offset 0xa38 - B $D3 ;offset 0xa3c - LDW -212(%r30),%r2 ;offset 0xa40 -$00060020 - COPY %r4,%r26 ;offset 0xa44 - EXTRD,U %r4,31,32,%r25 ;offset 0xa48 - COPY %r2,%r24 ;offset 0xa4c - .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) - B,L $$div2U,%r31 ;offset 0xa50 - EXTRD,U %r2,31,32,%r23 ;offset 0xa54 - DEPD %r28,31,32,%r29 ;offset 0xa58 -$00060022 - STD %r29,-152(%r30) ;offset 0xa5c -$D1 - AND %r5,%r19,%r24 ;offset 0xa60 - EXTRD,U %r24,31,32,%r24 ;offset 0xa64 - STW %r2,-160(%r30) ;offset 0xa68 - STW %r7,-128(%r30) ;offset 0xa6c - FLDD -152(%r30),%fr4 ;offset 0xa70 - FLDD -152(%r30),%fr7 ;offset 0xa74 - FLDW -160(%r30),%fr8L ;offset 0xa78 - FLDW -128(%r30),%fr5L ;offset 0xa7c - XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 - FSTD %fr10,-136(%r30) ;offset 0xa84 - XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 - FSTD %fr22,-144(%r30) ;offset 0xa8c - XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 - XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 - FSTD %fr11,-112(%r30) ;offset 0xa98 - FSTD %fr23,-120(%r30) ;offset 0xa9c - LDD -136(%r30),%r28 ;offset 0xaa0 - DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 - LDD -144(%r30),%r20 ;offset 0xaa8 - ADD,L %r20,%r31,%r31 ;offset 0xaac - LDD -112(%r30),%r22 ;offset 0xab0 - DEPD,Z %r22,31,32,%r22 ;offset 0xab4 - LDD -120(%r30),%r21 ;offset 0xab8 - B $00060024 ;offset 0xabc - ADD,L %r21,%r22,%r23 ;offset 0xac0 -$D0 - OR %r9,%r29,%r29 ;offset 0xac4 -$00060040 - EXTRD,U %r29,31,32,%r28 ;offset 0xac8 -$00060002 -$L2 - LDW -212(%r30),%r2 ;offset 0xacc -$D3 - LDW -168(%r30),%r9 ;offset 0xad0 - LDD -176(%r30),%r8 ;offset 0xad4 - EXTRD,U %r8,31,32,%r7 ;offset 0xad8 - LDD -184(%r30),%r6 ;offset 0xadc - EXTRD,U %r6,31,32,%r5 ;offset 0xae0 - LDW -188(%r30),%r4 ;offset 0xae4 - BVE (%r2) ;offset 0xae8 - .EXIT - LDW,MB -192(%r30),%r3 ;offset 0xaec - .PROCEND ;in=23,25;out=28,29;fpin=105,107; - - - - -;---------------------------------------------------------------------------- -; -; Registers to hold 64-bit values to manipulate. The "L" part -; of the register corresponds to the upper 32-bits, while the "R" -; part corresponds to the lower 32-bits -; -; Note, that when using b6 and b7, the code must save these before -; using them because they are callee save registers -; -; -; Floating point registers to use to save values that -; are manipulated. These don't collide with ftemp1-6 and -; are all caller save registers -; -a0 .reg %fr22 -a0L .reg %fr22L -a0R .reg %fr22R - -a1 .reg %fr23 -a1L .reg %fr23L -a1R .reg %fr23R - -a2 .reg %fr24 -a2L .reg %fr24L -a2R .reg %fr24R - -a3 .reg %fr25 -a3L .reg %fr25L -a3R .reg %fr25R - -a4 .reg %fr26 -a4L .reg %fr26L -a4R .reg %fr26R - -a5 .reg %fr27 -a5L .reg %fr27L -a5R .reg %fr27R - -a6 .reg %fr28 -a6L .reg %fr28L -a6R .reg %fr28R - -a7 .reg %fr29 -a7L .reg %fr29L -a7R .reg %fr29R - -b0 .reg %fr30 -b0L .reg %fr30L -b0R .reg %fr30R - -b1 .reg %fr31 -b1L .reg %fr31L -b1R .reg %fr31R - -; -; Temporary floating point variables, these are all caller save -; registers -; -ftemp1 .reg %fr4 -ftemp2 .reg %fr5 -ftemp3 .reg %fr6 -ftemp4 .reg %fr7 - -; -; The B set of registers when used. 
-; - -b2 .reg %fr8 -b2L .reg %fr8L -b2R .reg %fr8R - -b3 .reg %fr9 -b3L .reg %fr9L -b3R .reg %fr9R - -b4 .reg %fr10 -b4L .reg %fr10L -b4R .reg %fr10R - -b5 .reg %fr11 -b5L .reg %fr11L -b5R .reg %fr11R - -b6 .reg %fr12 -b6L .reg %fr12L -b6R .reg %fr12R - -b7 .reg %fr13 -b7L .reg %fr13L -b7R .reg %fr13R - -c1 .reg %r21 ; only reg -temp1 .reg %r20 ; only reg -temp2 .reg %r19 ; only reg -temp3 .reg %r31 ; only reg - -m1 .reg %r28 -c2 .reg %r23 -high_one .reg %r1 -ht .reg %r6 -lt .reg %r5 -m .reg %r4 -c3 .reg %r3 - -SQR_ADD_C .macro A0L,A0R,C1,C2,C3 - XMPYU A0L,A0R,ftemp1 ; m - FSTD ftemp1,-24(%sp) ; store m - - XMPYU A0R,A0R,ftemp2 ; lt - FSTD ftemp2,-16(%sp) ; store lt - - XMPYU A0L,A0L,ftemp3 ; ht - FSTD ftemp3,-8(%sp) ; store ht - - LDD -24(%sp),m ; load m - AND m,high_mask,temp2 ; m & Mask - DEPD,Z m,30,31,temp3 ; m << 32+1 - LDD -16(%sp),lt ; lt - - LDD -8(%sp),ht ; ht - EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 - ADD temp3,lt,lt ; lt = lt+m - ADD,L ht,temp1,ht ; ht += temp1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; ht++ - - ADD C2,ht,C2 ; c2=c2+ht - ADD,DC C3,%r0,C3 ; c3++ -.endm - -SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 - XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU A0R,A1L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,A1R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,A1L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD ht,ht,ht ; ht=ht+ht; - ADD,DC C3,%r0,C3 ; add in carry (c3++) - - ADD lt,lt,lt ; lt=lt+lt; - ADD,DC ht,%r0,ht ; add in carry (ht++) - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) - LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - -; -;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba8 - .PROC - .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .ENTRY - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 - 
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 - STD c2,56(r_ptr) ; r[7] = c2; - COPY %r0,c2 - - SQR_ADD_C a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 - SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 - STD c3,64(r_ptr) ; r[8] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 - STD c1,72(r_ptr) ; r[9] = c1; - COPY %r0,c1 - - SQR_ADD_C a5L,a5R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 - SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 - STD c2,80(r_ptr) ; r[10] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 - STD c3,88(r_ptr) ; r[11] = c3; - COPY %r0,c3 - - SQR_ADD_C a6L,a6R,c1,c2,c3 - SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 - STD c1,96(r_ptr) ; r[12] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 - STD c2,104(r_ptr) ; r[13] = c2; - COPY %r0,c2 - - SQR_ADD_C a7L,a7R,c3,c1,c2 - STD c3, 112(r_ptr) ; r[14] = c3 - STD c1, 120(r_ptr) ; r[15] = c1 - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - STD c2,56(r_ptr) ; r[7] = c2; - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--------------------------------------------------------------------------- - -MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 - XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,B0R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,B0L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - - -; -;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba8 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - FLDD 32(b_ptr),b4 - FLDD 40(b_ptr),b5 - FLDD 48(b_ptr),b6 - FLDD 56(b_ptr),b7 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 - STD c1,48(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 - STD c2,56(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 - 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 - STD c3,64(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 - STD c1,72(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 - STD c2,80(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 - STD c3,88(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 - STD c1,96(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 - STD c2,104(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 - STD c3,112(r_ptr) - STD c1,120(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - STD c1,48(r_ptr) - STD c2,56(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--- not PIC .SPACE $TEXT$ -;--- not PIC .SUBSPA $CODE$ -;--- not PIC .SPACE $PRIVATE$,SORT=16 -;--- not PIC .IMPORT $global$,DATA -;--- not PIC .SPACE $TEXT$ -;--- not PIC .SUBSPA $CODE$ -;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c -;--- not PIC C$7 -;--- not PIC .ALIGN 8 -;--- not PIC .STRINGZ "Division would overflow (%d)\n" - 
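For orientation: the SQR_ADD_C/MUL_ADD_C macros above (and their copies in the renamed .S file later in this patch) fold one 64x64-bit partial product into a rolling three-word column accumulator, the c1/c2/c3 registers of the comba routines. A hedged C sketch of that single step — assuming a 64-bit BN_ULONG and a compiler-provided unsigned __int128; the helper name mul_add_c_sketch is illustrative only, not code from this patch:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Illustrative helper: add the 128-bit product a*b into the running
   column accumulator (c0 = current column, c1 = next, c2 = overflow). */
static void mul_add_c_sketch(BN_ULONG a, BN_ULONG b,
                             BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
{
    unsigned __int128 t = (unsigned __int128)a * b;
    BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

    *c0 += lo;                /* low half into the current column      */
    hi  += (*c0 < lo);        /* carry out of the low half             */
    *c1 += hi;                /* high half into the next column        */
    *c2 += (*c1 < hi);        /* any further carry into the third word */
}

SQR_ADD_C2 corresponds to adding such a product twice, which is how the off-diagonal a[i]*a[j] cross terms of the squaring routines are accumulated.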
.END diff --git a/main/openssl/crypto/bn/asm/pa-risc2W.S b/main/openssl/crypto/bn/asm/pa-risc2W.S new file mode 100644 index 00000000..a9954575 --- /dev/null +++ b/main/openssl/crypto/bn/asm/pa-risc2W.S @@ -0,0 +1,1605 @@ +; +; PA-RISC 64-bit implementation of bn_asm code +; +; This code is approximately 2x faster than the C version +; for RSA/DSA. +; +; See http://devresource.hp.com/ for more details on the PA-RISC +; architecture. Also see the book "PA-RISC 2.0 Architecture" +; by Gerry Kane for information on the instruction set architecture. +; +; Code written by Chris Ruemmler (with some help from the HP C +; compiler). +; +; The code compiles with HP's assembler +; + + .level 2.0W + .space $TEXT$ + .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY + +; +; Global Register definitions used for the routines. +; +; Some information about HP's runtime architecture for 64-bits. +; +; "Caller save" means the calling function must save the register +; if it wants the register to be preserved. +; "Callee save" means if a function uses the register, it must save +; the value before using it. +; +; For the floating point registers +; +; "caller save" registers: fr4-fr11, fr22-fr31 +; "callee save" registers: fr12-fr21 +; "special" registers: fr0-fr3 (status and exception registers) +; +; For the integer registers +; value zero : r0 +; "caller save" registers: r1,r19-r26 +; "callee save" registers: r3-r18 +; return register : r2 (rp) +; return values ; r28 (ret0,ret1) +; Stack pointer ; r30 (sp) +; global data pointer ; r27 (dp) +; argument pointer ; r29 (ap) +; millicode return ptr ; r31 (also a caller save register) + + +; +; Arguments to the routines +; +r_ptr .reg %r26 +a_ptr .reg %r25 +b_ptr .reg %r24 +num .reg %r24 +w .reg %r23 +n .reg %r23 + + +; +; Globals used in some routines +; + +top_overflow .reg %r29 +high_mask .reg %r22 ; value 0xffffffff80000000L + + +;------------------------------------------------------------------------------ +; +; bn_mul_add_words +; +;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, +; int num, BN_ULONG w) +; +; arg0 = r_ptr +; arg1 = a_ptr +; arg2 = num +; arg3 = w +; +; Local register definitions +; + +fm1 .reg %fr22 +fm .reg %fr23 +ht_temp .reg %fr24 +ht_temp_1 .reg %fr25 +lt_temp .reg %fr26 +lt_temp_1 .reg %fr27 +fm1_1 .reg %fr28 +fm_1 .reg %fr29 + +fw_h .reg %fr7L +fw_l .reg %fr7R +fw .reg %fr7 + +fht_0 .reg %fr8L +flt_0 .reg %fr8R +t_float_0 .reg %fr8 + +fht_1 .reg %fr9L +flt_1 .reg %fr9R +t_float_1 .reg %fr9 + +tmp_0 .reg %r31 +tmp_1 .reg %r21 +m_0 .reg %r20 +m_1 .reg %r19 +ht_0 .reg %r1 +ht_1 .reg %r3 +lt_0 .reg %r4 +lt_1 .reg %r5 +m1_0 .reg %r6 +m1_1 .reg %r7 +rp_val .reg %r8 +rp_val_1 .reg %r9 + +bn_mul_add_words + .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN + .proc + .callinfo frame=128 + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + NOP ; Needed to make the loop 16-byte aligned + NOP ; Needed to make the loop 16-byte aligned + + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + STD %r7,32(%sp) ; save r7 + STD %r8,40(%sp) ; save r8 + + STD %r9,48(%sp) ; save r9 + COPY %r0,%ret0 ; return 0 by default + DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 + STD w,56(%sp) ; store w on stack + + CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit + LDO 128(%sp),%sp ; bump stack + + ; + ; The loop is unrolled twice, so if there is only 1 number + ; then go straight to the cleanup code. 
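The prototype comment above describes bn_mul_add_words as a multiply-accumulate over an array of words. Purely as an orientation aid — a sketch assuming a 64-bit BN_ULONG and unsigned __int128, with the _sketch name being illustrative rather than code from this patch — the C-level contract is:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Sketch of the contract: rp[i] += ap[i] * w with a propagated
   word-sized carry; the final carry word is returned. */
BN_ULONG bn_mul_add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                                 int num, BN_ULONG w)
{
    BN_ULONG carry = 0;

    for (int i = 0; i < num; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * w;
        t += rp[i];
        t += carry;
        rp[i] = (BN_ULONG)t;           /* low 64 bits back into r */
        carry = (BN_ULONG)(t >> 64);   /* high 64 bits carry on   */
    }
    return carry;
}

bn_mul_words, further down in this hunk, is the same loop except that rp[i] is overwritten with the product plus carry rather than accumulated into.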
+ ; + CMPIB,= 1,num,bn_mul_add_words_single_top + FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus + ; two 32-bit mutiplies can be issued per cycle. + ; +bn_mul_add_words_unroll2 + + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) + LDD 0(r_ptr),rp_val ; rp[0] + LDD 8(r_ptr),rp_val_1 ; rp[1] + + XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l + XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1[0] + FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] + + XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h + XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m[0] + FSTD fm_1,-40(%sp) ; -40(sp) = m[1] + + XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h + XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp + FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 + + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp + FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 + + LDD -8(%sp),m_0 ; m[0] + LDD -40(%sp),m_1 ; m[1] + LDD -16(%sp),m1_0 ; m1[0] + LDD -48(%sp),m1_1 ; m1[1] + + LDD -24(%sp),ht_0 ; ht[0] + LDD -56(%sp),ht_1 ; ht[1] + ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; + ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; + + LDD -32(%sp),lt_0 + LDD -64(%sp),lt_1 + CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) + ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) + + CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) + ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) + EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 + + EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 + DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 + ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) + ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) + + ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + + ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + + LDO -2(num),num ; num = num - 2; + ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + STD lt_0,0(r_ptr) ; rp[0] = lt[0] + + ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] + ADD,DC ht_1,%r0,%ret0 ; ht[1]++ + LDO 16(a_ptr),a_ptr ; a_ptr += 2 + + STD lt_1,8(r_ptr) ; rp[1] = lt[1] + CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do + LDO 16(r_ptr),r_ptr ; r_ptr += 2 + + CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one + + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_mul_add_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + LDD 0(r_ptr),rp_val ; rp[0] + LDO 8(a_ptr),a_ptr ; a_ptr++ + XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + + LDD -8(%sp),m_0 + LDD -16(%sp),m1_0 ; m1 = temp1 + ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; + LDD -24(%sp),ht_0 + LDD -32(%sp),lt_0 + + CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + 
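Because XMPYU is a 32x32-to-64-bit multiply, the unrolled loop above builds each 64x64 product from four partial products (the ht/lt and m/m1 temporaries) and recombines them with explicit carries — that is what the top_overflow correction and the ADD,DC chains do. A hedged C sketch of the same recombination, with illustrative names only:

#include <stdint.h>

/* Sketch: 64x64 -> 128 multiply assembled from four 32x32 products,
   mirroring the ht/lt/m/m1 scheme used by the assembly. */
static void mul64x64_sketch(uint64_t a, uint64_t b,
                            uint64_t *hi, uint64_t *lo)
{
    uint64_t ah = a >> 32, al = a & 0xffffffffu;
    uint64_t bh = b >> 32, bl = b & 0xffffffffu;

    uint64_t ht = ah * bh;             /* high*high                    */
    uint64_t lt = al * bl;             /* low*low                      */
    uint64_t m  = ah * bl;             /* the two cross terms          */
    uint64_t m1 = al * bh;

    uint64_t mid = m + m1;
    if (mid < m)                       /* cross-term sum overflowed:   */
        ht += (uint64_t)1 << 32;       /* credit 2^64 to the high part */

    ht += mid >> 32;                   /* upper half of the cross sum  */
    *lo = lt + (mid << 32);            /* lower half, aligned at 2^32  */
    *hi = ht + (*lo < lt);             /* carry out of the low word    */
}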
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + ADD %ret0,tmp_0,lt_0 ; lt = lt + c; + ADD,DC ht_0,%r0,ht_0 ; ht++ + ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] + ADD,DC ht_0,%r0,%ret0 ; ht++ + STD lt_0,0(r_ptr) ; rp[0] = lt + +bn_mul_add_words_exit + .EXIT + LDD -80(%sp),%r9 ; restore r9 + LDD -88(%sp),%r8 ; restore r8 + LDD -96(%sp),%r7 ; restore r7 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 ; restore r3 + .PROCEND ;in=23,24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +; +; arg0 = rp +; arg1 = ap +; arg2 = num +; arg3 = w + +bn_mul_words + .proc + .callinfo frame=128 + .entry + .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + + STD %r7,32(%sp) ; save r7 + COPY %r0,%ret0 ; return 0 by default + DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 + STD w,56(%sp) ; w on stack + + CMPIB,>= 0,num,bn_mul_words_exit + LDO 128(%sp),%sp ; bump stack + + ; + ; See if only 1 word to do, thus just do cleanup + ; + CMPIB,= 1,num,bn_mul_words_single_top + FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus + ; two 32-bit mutiplies can be issued per cycle. + ; +bn_mul_words_unroll2 + + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) + XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l + XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l + + FSTD fm1,-16(%sp) ; -16(sp) = m1 + FSTD fm1_1,-48(%sp) ; -48(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h + + FSTD fm,-8(%sp) ; -8(sp) = m + FSTD fm_1,-40(%sp) ; -40(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h + XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h + + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l + + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt + LDD -8(%sp),m_0 + LDD -40(%sp),m_1 + + LDD -16(%sp),m1_0 + LDD -48(%sp),m1_1 + LDD -24(%sp),ht_0 + LDD -56(%sp),ht_1 + + ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; + ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; + LDD -32(%sp),lt_0 + LDD -64(%sp),lt_1 + + CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) + ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) + + EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + EXTRD,U tmp_1,31,32,m_1 ; m>>32 + DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) + ADD lt_0,m1_0,lt_0 ; lt = lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD lt_1,m1_1,lt_1 ; lt = lt+m1; + ADD,DC ht_1,%r0,ht_1 ; ht++ + ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0); + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) + ADD,DC ht_1,%r0,ht_1 ; ht++ + STD lt_0,0(r_ptr) ; rp[0] = lt + STD lt_1,8(r_ptr) ; rp[1] = lt + + COPY ht_1,%ret0 ; carry = ht + LDO 
-2(num),num ; num = num - 2; + LDO 16(a_ptr),a_ptr ; ap += 2 + CMPIB,<= 2,num,bn_mul_words_unroll2 + LDO 16(r_ptr),r_ptr ; rp++ + + CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? + + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_mul_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + + XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l + FSTD fm1,-16(%sp) ; -16(sp) = m1 + XMPYU flt_0,fw_h,fm ; m = lt*fw_h + FSTD fm,-8(%sp) ; -8(sp) = m + XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h + FSTD ht_temp,-24(%sp) ; -24(sp) = ht + XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l + FSTD lt_temp,-32(%sp) ; -32(sp) = lt + + LDD -8(%sp),m_0 + LDD -16(%sp),m1_0 + ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; + LDD -24(%sp),ht_0 + LDD -32(%sp),lt_0 + + CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) + ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) + + EXTRD,U tmp_0,31,32,m_0 ; m>>32 + DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 + + ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) + ADD lt_0,m1_0,lt_0 ; lt= lt+m1; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + ADD %ret0,lt_0,lt_0 ; lt = lt + c; + ADD,DC ht_0,%r0,ht_0 ; ht++ + + COPY ht_0,%ret0 ; copy carry + STD lt_0,0(r_ptr) ; rp[0] = lt + +bn_mul_words_exit + .EXIT + LDD -96(%sp),%r7 ; restore r7 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 ; restore r3 + .PROCEND ;in=23,24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) +; +; arg0 = rp +; arg1 = ap +; arg2 = num +; + +bn_sqr_words + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + NOP + STD %r5,16(%sp) ; save r5 + + CMPIB,>= 0,num,bn_sqr_words_exit + LDO 128(%sp),%sp ; bump stack + + ; + ; If only 1, the goto straight to cleanup + ; + CMPIB,= 1,num,bn_sqr_words_single_top + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; + +bn_sqr_words_unroll2 + FLDD 0(a_ptr),t_float_0 ; a[0] + FLDD 8(a_ptr),t_float_1 ; a[1] + XMPYU fht_0,flt_0,fm ; m[0] + XMPYU fht_1,flt_1,fm_1 ; m[1] + + FSTD fm,-24(%sp) ; store m[0] + FSTD fm_1,-56(%sp) ; store m[1] + XMPYU flt_0,flt_0,lt_temp ; lt[0] + XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] + + FSTD lt_temp,-16(%sp) ; store lt[0] + FSTD lt_temp_1,-48(%sp) ; store lt[1] + XMPYU fht_0,fht_0,ht_temp ; ht[0] + XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] + + FSTD ht_temp,-8(%sp) ; store ht[0] + FSTD ht_temp_1,-40(%sp) ; store ht[1] + LDD -24(%sp),m_0 + LDD -56(%sp),m_1 + + AND m_0,high_mask,tmp_0 ; m[0] & Mask + AND m_1,high_mask,tmp_1 ; m[1] & Mask + DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 + DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 + + LDD -16(%sp),lt_0 + LDD -48(%sp),lt_1 + EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 + EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 + + LDD -8(%sp),ht_0 + LDD -40(%sp),ht_1 + ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 + ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 + + ADD lt_0,m_0,lt_0 ; lt = lt+m + ADD,DC ht_0,%r0,ht_0 ; ht[0]++ + STD lt_0,0(r_ptr) ; rp[0] = lt[0] + STD ht_0,8(r_ptr) ; rp[1] = ht[1] + + ADD lt_1,m_1,lt_1 ; lt = lt+m + ADD,DC ht_1,%r0,ht_1 ; ht[1]++ + STD lt_1,16(r_ptr) ; rp[2] = lt[1] + STD ht_1,24(r_ptr) ; rp[3] = ht[1] + + LDO -2(num),num ; num = num - 2; + LDO 16(a_ptr),a_ptr ; ap += 2 + CMPIB,<= 2,num,bn_sqr_words_unroll2 + LDO 
32(r_ptr),r_ptr ; rp += 4 + + CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? + + ; + ; Top of loop aligned on 64-byte boundary + ; +bn_sqr_words_single_top + FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) + + XMPYU fht_0,flt_0,fm ; m + FSTD fm,-24(%sp) ; store m + + XMPYU flt_0,flt_0,lt_temp ; lt + FSTD lt_temp,-16(%sp) ; store lt + + XMPYU fht_0,fht_0,ht_temp ; ht + FSTD ht_temp,-8(%sp) ; store ht + + LDD -24(%sp),m_0 ; load m + AND m_0,high_mask,tmp_0 ; m & Mask + DEPD,Z m_0,30,31,m_0 ; m << 32+1 + LDD -16(%sp),lt_0 ; lt + + LDD -8(%sp),ht_0 ; ht + EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 + ADD m_0,lt_0,lt_0 ; lt = lt+m + ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 + ADD,DC ht_0,%r0,ht_0 ; ht++ + + STD lt_0,0(r_ptr) ; rp[0] = lt + STD ht_0,8(r_ptr) ; rp[1] = ht + +bn_sqr_words_exit + .EXIT + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + .PROCEND ;in=23,24,25,26,29;out=28; + + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) +; +; arg0 = rp +; arg1 = ap +; arg2 = bp +; arg3 = n + +t .reg %r22 +b .reg %r21 +l .reg %r20 + +bn_add_words + .proc + .entry + .callinfo + .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .align 64 + + CMPIB,>= 0,n,bn_add_words_exit + COPY %r0,%ret0 ; return 0 by default + + ; + ; If 2 or more numbers do the loop + ; + CMPIB,= 1,n,bn_add_words_single_top + NOP + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; +bn_add_words_unroll2 + LDD 0(a_ptr),t + LDD 0(b_ptr),b + ADD t,%ret0,t ; t = t+c; + ADD,DC %r0,%r0,%ret0 ; set c to carry + ADD t,b,l ; l = t + b[0] + ADD,DC %ret0,%r0,%ret0 ; c+= carry + STD l,0(r_ptr) + + LDD 8(a_ptr),t + LDD 8(b_ptr),b + ADD t,%ret0,t ; t = t+c; + ADD,DC %r0,%r0,%ret0 ; set c to carry + ADD t,b,l ; l = t + b[0] + ADD,DC %ret0,%r0,%ret0 ; c+= carry + STD l,8(r_ptr) + + LDO -2(n),n + LDO 16(a_ptr),a_ptr + LDO 16(b_ptr),b_ptr + + CMPIB,<= 2,n,bn_add_words_unroll2 + LDO 16(r_ptr),r_ptr + + CMPIB,=,N 0,n,bn_add_words_exit ; are we done? + +bn_add_words_single_top + LDD 0(a_ptr),t + LDD 0(b_ptr),b + + ADD t,%ret0,t ; t = t+c; + ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) 
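Two routines in this stretch have simple word-level contracts: bn_sqr_words (closed just above) writes each input word's full square into two result words, and bn_add_words (whose cleanup path continues just below, with bn_sub_words as its borrow-propagating mirror) is a plain carry chain. An orientation-only C sketch, under the same 64-bit BN_ULONG / unsigned __int128 assumptions:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Sketch: rp receives 2*num words, the low and high halves of each
   ap[i] squared. */
void bn_sqr_words_sketch(BN_ULONG *rp, const BN_ULONG *ap, int num)
{
    for (int i = 0; i < num; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * ap[i];
        rp[2 * i]     = (BN_ULONG)t;
        rp[2 * i + 1] = (BN_ULONG)(t >> 64);
    }
}

/* Sketch: word-wise addition; the returned carry is 0 or 1, matching
   the ADD / ADD,DC pairs in the assembly. */
BN_ULONG bn_add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                             const BN_ULONG *bp, int n)
{
    BN_ULONG carry = 0;

    for (int i = 0; i < n; i++) {
        BN_ULONG t = ap[i] + carry;
        BN_ULONG c = (t < carry);       /* carry out of the first add  */
        rp[i] = t + bp[i];
        carry = c + (rp[i] < t);        /* carry out of the second add */
    }
    return carry;
}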
+ ADD t,b,l ; l = t + b[0] + ADD,DC %ret0,%r0,%ret0 ; c+= carry + STD l,0(r_ptr) + +bn_add_words_exit + .EXIT + BVE (%rp) + NOP + .PROCEND ;in=23,24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) +; +; arg0 = rp +; arg1 = ap +; arg2 = bp +; arg3 = n + +t1 .reg %r22 +t2 .reg %r21 +sub_tmp1 .reg %r20 +sub_tmp2 .reg %r19 + + +bn_sub_words + .proc + .callinfo + .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + CMPIB,>= 0,n,bn_sub_words_exit + COPY %r0,%ret0 ; return 0 by default + + ; + ; If 2 or more numbers do the loop + ; + CMPIB,= 1,n,bn_sub_words_single_top + NOP + + ; + ; This loop is unrolled 2 times (64-byte aligned as well) + ; +bn_sub_words_unroll2 + LDD 0(a_ptr),t1 + LDD 0(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; + + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret0 + STD sub_tmp1,0(r_ptr) + + LDD 8(a_ptr),t1 + LDD 8(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret0 + STD sub_tmp1,8(r_ptr) + + LDO -2(n),n + LDO 16(a_ptr),a_ptr + LDO 16(b_ptr),b_ptr + + CMPIB,<= 2,n,bn_sub_words_unroll2 + LDO 16(r_ptr),r_ptr + + CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? + +bn_sub_words_single_top + LDD 0(a_ptr),t1 + LDD 0(b_ptr),t2 + SUB t1,t2,sub_tmp1 ; t3 = t1-t2; + SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; + CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 + LDO 1(%r0),sub_tmp2 + + CMPCLR,*= t1,t2,%r0 + COPY sub_tmp2,%ret0 + + STD sub_tmp1,0(r_ptr) + +bn_sub_words_exit + .EXIT + BVE (%rp) + NOP + .PROCEND ;in=23,24,25,26,29;out=28; + +;------------------------------------------------------------------------------ +; +; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) +; +; arg0 = h +; arg1 = l +; arg2 = d +; +; This is mainly just modified assembly from the compiler, thus the +; lack of variable names. 
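The header comment above gives bn_div_words a two-word dividend (h:l) and a single-word divisor d; the routine below normalises d and forms the quotient in 32-bit pieces, falling into the fprintf/abort path ("Division would overflow") when the quotient would not fit in one word. As a contract-only sketch — assuming unsigned __int128 and the usual precondition h < d; this is not the algorithm the assembly uses:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Sketch of the contract: quotient of the 128-bit value (h:l) by d,
   valid when h < d so the quotient fits in one word. */
BN_ULONG bn_div_words_sketch(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    unsigned __int128 n = ((unsigned __int128)h << 64) | l;
    return (BN_ULONG)(n / d);
}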
+; +;------------------------------------------------------------------------------ +bn_div_words + .proc + .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .IMPORT BN_num_bits_word,CODE,NO_RELOCATION + .IMPORT __iob,DATA + .IMPORT fprintf,CODE,NO_RELOCATION + .IMPORT abort,CODE,NO_RELOCATION + .IMPORT $$div2U,MILLICODE + .entry + STD %r2,-16(%r30) + STD,MA %r3,352(%r30) + STD %r4,-344(%r30) + STD %r5,-336(%r30) + STD %r6,-328(%r30) + STD %r7,-320(%r30) + STD %r8,-312(%r30) + STD %r9,-304(%r30) + STD %r10,-296(%r30) + + STD %r27,-288(%r30) ; save gp + + COPY %r24,%r3 ; save d + COPY %r26,%r4 ; save h (high 64-bits) + LDO -1(%r0),%ret0 ; return -1 by default + + CMPB,*= %r0,%arg2,$D3 ; if (d == 0) + COPY %r25,%r5 ; save l (low 64-bits) + + LDO -48(%r30),%r29 ; create ap + .CALL ;in=26,29;out=28; + B,L BN_num_bits_word,%r2 + COPY %r3,%r26 + LDD -288(%r30),%r27 ; restore gp + LDI 64,%r21 + + CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) + COPY %ret0,%r24 ; i + MTSARCM %r24 + DEPDI,Z -1,%sar,1,%r29 + CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<= d) + SUB %r4,%r3,%r4 ; h -= d + CMPB,= %r31,%r0,$0000001A ; if (i) + COPY %r0,%r10 ; ret = 0 + MTSARCM %r31 ; i to shift + DEPD,Z %r3,%sar,64,%r3 ; d <<= i; + SUBI 64,%r31,%r19 ; 64 - i; redundent + MTSAR %r19 ; (64 -i) to shift + SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i) + MTSARCM %r31 ; i to shift + DEPD,Z %r5,%sar,64,%r5 ; l <<= i; + +$0000001A + DEPDI,Z -1,31,32,%r19 + EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32 + EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff + LDO 2(%r0),%r9 + STD %r3,-280(%r30) ; "d" to stack + +$0000001C + DEPDI,Z -1,63,32,%r29 ; + EXTRD,U %r4,31,32,%r31 ; h >> 32 + CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div + COPY %r4,%r26 + EXTRD,U %r4,31,32,%r25 + COPY %r6,%r24 + .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) + B,L $$div2U,%r2 + EXTRD,U %r6,31,32,%r23 + DEPD %r28,31,32,%r29 +$D2 + STD %r29,-272(%r30) ; q + AND %r5,%r19,%r24 ; t & 0xffffffff00000000; + EXTRD,U %r24,31,32,%r24 ; ??? 
+ FLDD -272(%r30),%fr7 ; q + FLDD -280(%r30),%fr8 ; d + XMPYU %fr8L,%fr7L,%fr10 + FSTD %fr10,-256(%r30) + XMPYU %fr8L,%fr7R,%fr22 + FSTD %fr22,-264(%r30) + XMPYU %fr8R,%fr7L,%fr11 + XMPYU %fr8R,%fr7R,%fr23 + FSTD %fr11,-232(%r30) + FSTD %fr23,-240(%r30) + LDD -256(%r30),%r28 + DEPD,Z %r28,31,32,%r2 + LDD -264(%r30),%r20 + ADD,L %r20,%r2,%r31 + LDD -232(%r30),%r22 + DEPD,Z %r22,31,32,%r22 + LDD -240(%r30),%r21 + B $00000024 ; enter loop + ADD,L %r21,%r22,%r23 + +$0000002A + LDO -1(%r29),%r29 + SUB %r23,%r8,%r23 +$00000024 + SUB %r4,%r31,%r25 + AND %r25,%r19,%r26 + CMPB,*<>,N %r0,%r26,$00000046 ; (forward) + DEPD,Z %r25,31,32,%r20 + OR %r20,%r24,%r21 + CMPB,*<<,N %r21,%r23,$0000002A ;(backward) + SUB %r31,%r6,%r31 +;-------------Break path--------------------- + +$00000046 + DEPD,Z %r23,31,32,%r25 ;tl + EXTRD,U %r23,31,32,%r26 ;t + AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L + ADD,L %r31,%r26,%r31 ;th += t; + CMPCLR,*>>= %r5,%r24,%r0 ;if (l>32)); + DEPD,Z %r29,31,32,%r10 ; ret = q<<32 + b $0000001C + DEPD,Z %r28,31,32,%r5 ; l = l << 32 + +$D1 + OR %r10,%r29,%r28 ; ret |= q +$D3 + LDD -368(%r30),%r2 +$D0 + LDD -296(%r30),%r10 + LDD -304(%r30),%r9 + LDD -312(%r30),%r8 + LDD -320(%r30),%r7 + LDD -328(%r30),%r6 + LDD -336(%r30),%r5 + LDD -344(%r30),%r4 + BVE (%r2) + .EXIT + LDD,MB -352(%r30),%r3 + +bn_div_err_case + MFIA %r6 + ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 + LDO R'bn_div_words-bn_div_err_case(%r1),%r6 + ADDIL LT'__iob,%r27,%r1 + LDD RT'__iob(%r1),%r26 + ADDIL L'C$4-bn_div_words,%r6,%r1 + LDO R'C$4-bn_div_words(%r1),%r25 + LDO 64(%r26),%r26 + .CALL ;in=24,25,26,29;out=28; + B,L fprintf,%r2 + LDO -48(%r30),%r29 + LDD -288(%r30),%r27 + .CALL ;in=29; + B,L abort,%r2 + LDO -48(%r30),%r29 + LDD -288(%r30),%r27 + B $D0 + LDD -368(%r30),%r2 + .PROCEND ;in=24,25,26,29;out=28; + +;---------------------------------------------------------------------------- +; +; Registers to hold 64-bit values to manipulate. The "L" part +; of the register corresponds to the upper 32-bits, while the "R" +; part corresponds to the lower 32-bits +; +; Note, that when using b6 and b7, the code must save these before +; using them because they are callee save registers +; +; +; Floating point registers to use to save values that +; are manipulated. These don't collide with ftemp1-6 and +; are all caller save registers +; +a0 .reg %fr22 +a0L .reg %fr22L +a0R .reg %fr22R + +a1 .reg %fr23 +a1L .reg %fr23L +a1R .reg %fr23R + +a2 .reg %fr24 +a2L .reg %fr24L +a2R .reg %fr24R + +a3 .reg %fr25 +a3L .reg %fr25L +a3R .reg %fr25R + +a4 .reg %fr26 +a4L .reg %fr26L +a4R .reg %fr26R + +a5 .reg %fr27 +a5L .reg %fr27L +a5R .reg %fr27R + +a6 .reg %fr28 +a6L .reg %fr28L +a6R .reg %fr28R + +a7 .reg %fr29 +a7L .reg %fr29L +a7R .reg %fr29R + +b0 .reg %fr30 +b0L .reg %fr30L +b0R .reg %fr30R + +b1 .reg %fr31 +b1L .reg %fr31L +b1R .reg %fr31R + +; +; Temporary floating point variables, these are all caller save +; registers +; +ftemp1 .reg %fr4 +ftemp2 .reg %fr5 +ftemp3 .reg %fr6 +ftemp4 .reg %fr7 + +; +; The B set of registers when used. 
+; + +b2 .reg %fr8 +b2L .reg %fr8L +b2R .reg %fr8R + +b3 .reg %fr9 +b3L .reg %fr9L +b3R .reg %fr9R + +b4 .reg %fr10 +b4L .reg %fr10L +b4R .reg %fr10R + +b5 .reg %fr11 +b5L .reg %fr11L +b5R .reg %fr11R + +b6 .reg %fr12 +b6L .reg %fr12L +b6R .reg %fr12R + +b7 .reg %fr13 +b7L .reg %fr13L +b7R .reg %fr13R + +c1 .reg %r21 ; only reg +temp1 .reg %r20 ; only reg +temp2 .reg %r19 ; only reg +temp3 .reg %r31 ; only reg + +m1 .reg %r28 +c2 .reg %r23 +high_one .reg %r1 +ht .reg %r6 +lt .reg %r5 +m .reg %r4 +c3 .reg %r3 + +SQR_ADD_C .macro A0L,A0R,C1,C2,C3 + XMPYU A0L,A0R,ftemp1 ; m + FSTD ftemp1,-24(%sp) ; store m + + XMPYU A0R,A0R,ftemp2 ; lt + FSTD ftemp2,-16(%sp) ; store lt + + XMPYU A0L,A0L,ftemp3 ; ht + FSTD ftemp3,-8(%sp) ; store ht + + LDD -24(%sp),m ; load m + AND m,high_mask,temp2 ; m & Mask + DEPD,Z m,30,31,temp3 ; m << 32+1 + LDD -16(%sp),lt ; lt + + LDD -8(%sp),ht ; ht + EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 + ADD temp3,lt,lt ; lt = lt+m + ADD,L ht,temp1,ht ; ht += temp1 + ADD,DC ht,%r0,ht ; ht++ + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC ht,%r0,ht ; ht++ + + ADD C2,ht,C2 ; c2=c2+ht + ADD,DC C3,%r0,C3 ; c3++ +.endm + +SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 + XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht + FSTD ftemp1,-16(%sp) ; + XMPYU A0R,A1L,ftemp2 ; m = bh*lt + FSTD ftemp2,-8(%sp) ; + XMPYU A0R,A1R,ftemp3 ; lt = bl*lt + FSTD ftemp3,-32(%sp) + XMPYU A0L,A1L,ftemp4 ; ht = bh*ht + FSTD ftemp4,-24(%sp) ; + + LDD -8(%sp),m ; r21 = m + LDD -16(%sp),m1 ; r19 = m1 + ADD,L m,m1,m ; m+m1 + + DEPD,Z m,31,32,temp3 ; (m+m1<<32) + LDD -24(%sp),ht ; r24 = ht + + CMPCLR,*>>= m,m1,%r0 ; if (m < m1) + ADD,L ht,high_one,ht ; ht+=high_one + + EXTRD,U m,31,32,temp1 ; m >> 32 + LDD -32(%sp),lt ; lt + ADD,L ht,temp1,ht ; ht+= m>>32 + ADD lt,temp3,lt ; lt = lt+m1 + ADD,DC ht,%r0,ht ; ht++ + + ADD ht,ht,ht ; ht=ht+ht; + ADD,DC C3,%r0,C3 ; add in carry (c3++) + + ADD lt,lt,lt ; lt=lt+lt; + ADD,DC ht,%r0,ht ; add in carry (ht++) + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) + LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise + + ADD C2,ht,C2 ; c2 = c2 + ht + ADD,DC C3,%r0,C3 ; add in carry (c3++) +.endm + +; +;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) +; arg0 = r_ptr +; arg1 = a_ptr +; + +bn_sqr_comba8 + .PROC + .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .ENTRY + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + SQR_ADD_C a0L,a0R,c1,c2,c3 + STD c1,0(r_ptr) ; r[0] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 + STD c2,8(r_ptr) ; r[1] = c2; + COPY %r0,c2 + + SQR_ADD_C a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 + STD c3,16(r_ptr) ; r[2] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 + SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 + STD c1,24(r_ptr) ; r[3] = c1; + COPY %r0,c1 + + SQR_ADD_C a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 + SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 + STD c2,32(r_ptr) ; r[4] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 + 
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 + STD c3,40(r_ptr) ; r[5] = c3; + COPY %r0,c3 + + SQR_ADD_C a3L,a3R,c1,c2,c3 + SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 + SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 + SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 + STD c1,48(r_ptr) ; r[6] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 + SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 + SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 + STD c2,56(r_ptr) ; r[7] = c2; + COPY %r0,c2 + + SQR_ADD_C a4L,a4R,c3,c1,c2 + SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 + SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 + SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 + STD c3,64(r_ptr) ; r[8] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 + SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 + SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 + STD c1,72(r_ptr) ; r[9] = c1; + COPY %r0,c1 + + SQR_ADD_C a5L,a5R,c2,c3,c1 + SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 + SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 + STD c2,80(r_ptr) ; r[10] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 + SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 + STD c3,88(r_ptr) ; r[11] = c3; + COPY %r0,c3 + + SQR_ADD_C a6L,a6R,c1,c2,c3 + SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 + STD c1,96(r_ptr) ; r[12] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 + STD c2,104(r_ptr) ; r[13] = c2; + COPY %r0,c2 + + SQR_ADD_C a7L,a7R,c3,c1,c2 + STD c3, 112(r_ptr) ; r[14] = c3 + STD c1, 120(r_ptr) ; r[15] = c1 + + .EXIT + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + +;----------------------------------------------------------------------------- +; +;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) +; arg0 = r_ptr +; arg1 = a_ptr +; + +bn_sqr_comba4 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + SQR_ADD_C a0L,a0R,c1,c2,c3 + + STD c1,0(r_ptr) ; r[0] = c1; + COPY %r0,c1 + + SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 + + STD c2,8(r_ptr) ; r[1] = c2; + COPY %r0,c2 + + SQR_ADD_C a1L,a1R,c3,c1,c2 + SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 + + STD c3,16(r_ptr) ; r[2] = c3; + COPY %r0,c3 + + SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 + SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 + + STD c1,24(r_ptr) ; r[3] = c1; + COPY %r0,c1 + + SQR_ADD_C a2L,a2R,c2,c3,c1 + SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 + + STD c2,32(r_ptr) ; r[4] = c2; + COPY %r0,c2 + + SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 + STD c3,40(r_ptr) ; r[5] = c3; + COPY %r0,c3 + + SQR_ADD_C a3L,a3R,c1,c2,c3 + STD c1,48(r_ptr) ; r[6] = c1; + STD c2,56(r_ptr) ; r[7] = c2; + + .EXIT + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + + +;--------------------------------------------------------------------------- + +MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 + XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht + FSTD ftemp1,-16(%sp) ; + XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt + FSTD ftemp2,-8(%sp) ; + XMPYU A0R,B0R,ftemp3 ; lt = bl*lt + FSTD ftemp3,-32(%sp) + XMPYU A0L,B0L,ftemp4 ; ht = bh*ht + FSTD ftemp4,-24(%sp) ; + + LDD -8(%sp),m ; r21 = m + LDD -16(%sp),m1 ; r19 = m1 + ADD,L m,m1,m ; m+m1 + + DEPD,Z m,31,32,temp3 ; (m+m1<<32) + LDD -24(%sp),ht ; r24 = ht + + CMPCLR,*>>= m,m1,%r0 ; if (m < m1) + ADD,L ht,high_one,ht ; ht+=high_one + + EXTRD,U m,31,32,temp1 ; m >> 32 + LDD -32(%sp),lt ; lt + ADD,L ht,temp1,ht ; ht+= m>>32 + ADD lt,temp3,lt ; lt = lt+m1 + ADD,DC ht,%r0,ht ; ht++ + + ADD C1,lt,C1 ; c1=c1+lt + ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise + + ADD C2,ht,C2 ; c2 = c2 + ht + ADD,DC C3,%r0,C3 ; add in carry (c3++) +.endm + + +; +;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) +; arg0 = r_ptr +; arg1 = a_ptr +; arg2 = b_ptr +; + +bn_mul_comba8 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + FSTD %fr12,32(%sp) ; save r6 + FSTD %fr13,40(%sp) ; save r7 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + FLDD 32(a_ptr),a4 + FLDD 40(a_ptr),a5 + FLDD 48(a_ptr),a6 + FLDD 56(a_ptr),a7 + + FLDD 0(b_ptr),b0 + FLDD 8(b_ptr),b1 + FLDD 16(b_ptr),b2 + FLDD 24(b_ptr),b3 + FLDD 32(b_ptr),b4 + FLDD 40(b_ptr),b5 + FLDD 48(b_ptr),b6 + FLDD 56(b_ptr),b7 + + MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 + STD c1,0(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 + STD c2,8(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 + STD c3,16(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 + STD c1,24(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 + STD c2,32(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 + MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 + MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 + STD c3,40(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 + STD c1,48(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 + MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 + STD c2,56(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 + 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 + STD c3,64(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 + MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 + MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 + STD c1,72(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 + MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 + MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 + MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 + MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 + STD c2,80(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 + MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 + MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 + MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 + STD c3,88(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 + MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 + MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 + STD c1,96(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 + MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 + STD c2,104(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 + STD c3,112(r_ptr) + STD c1,120(r_ptr) + + .EXIT + FLDD -88(%sp),%fr13 + FLDD -96(%sp),%fr12 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + +;----------------------------------------------------------------------------- +; +;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) +; arg0 = r_ptr +; arg1 = a_ptr +; arg2 = b_ptr +; + +bn_mul_comba4 + .proc + .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE + .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN + .entry + .align 64 + + STD %r3,0(%sp) ; save r3 + STD %r4,8(%sp) ; save r4 + STD %r5,16(%sp) ; save r5 + STD %r6,24(%sp) ; save r6 + FSTD %fr12,32(%sp) ; save r6 + FSTD %fr13,40(%sp) ; save r7 + + ; + ; Zero out carries + ; + COPY %r0,c1 + COPY %r0,c2 + COPY %r0,c3 + + LDO 128(%sp),%sp ; bump stack + DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 + + ; + ; Load up all of the values we are going to use + ; + FLDD 0(a_ptr),a0 + FLDD 8(a_ptr),a1 + FLDD 16(a_ptr),a2 + FLDD 24(a_ptr),a3 + + FLDD 0(b_ptr),b0 + FLDD 8(b_ptr),b1 + FLDD 16(b_ptr),b2 + FLDD 24(b_ptr),b3 + + MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 + STD c1,0(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 + STD c2,8(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 + MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 + MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 + STD c3,16(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 + MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 + MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 + MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 + STD c1,24(r_ptr) + COPY %r0,c1 + + MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 + MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 + MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 + STD c2,32(r_ptr) + COPY %r0,c2 + + MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 + MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 + STD c3,40(r_ptr) + COPY %r0,c3 + + MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 + STD c1,48(r_ptr) + STD c2,56(r_ptr) + + .EXIT + FLDD -88(%sp),%fr13 + FLDD -96(%sp),%fr12 + LDD -104(%sp),%r6 ; restore r6 + LDD -112(%sp),%r5 ; restore r5 + LDD -120(%sp),%r4 ; restore r4 + BVE (%rp) + LDD,MB -128(%sp),%r3 + + .PROCEND + + + .SPACE $TEXT$ + .SUBSPA $CODE$ + .SPACE $PRIVATE$,SORT=16 + .IMPORT $global$,DATA + .SPACE $TEXT$ + .SUBSPA $CODE$ + .SUBSPA $LIT$,ACCESS=0x2c +C$4 + .ALIGN 8 + .STRINGZ "Division would overflow (%d)\n" + .END diff --git a/main/openssl/crypto/bn/asm/pa-risc2W.s b/main/openssl/crypto/bn/asm/pa-risc2W.s deleted file mode 
100644 index a9954575..00000000 --- a/main/openssl/crypto/bn/asm/pa-risc2W.s +++ /dev/null @@ -1,1605 +0,0 @@ -; -; PA-RISC 64-bit implementation of bn_asm code -; -; This code is approximately 2x faster than the C version -; for RSA/DSA. -; -; See http://devresource.hp.com/ for more details on the PA-RISC -; architecture. Also see the book "PA-RISC 2.0 Architecture" -; by Gerry Kane for information on the instruction set architecture. -; -; Code written by Chris Ruemmler (with some help from the HP C -; compiler). -; -; The code compiles with HP's assembler -; - - .level 2.0W - .space $TEXT$ - .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY - -; -; Global Register definitions used for the routines. -; -; Some information about HP's runtime architecture for 64-bits. -; -; "Caller save" means the calling function must save the register -; if it wants the register to be preserved. -; "Callee save" means if a function uses the register, it must save -; the value before using it. -; -; For the floating point registers -; -; "caller save" registers: fr4-fr11, fr22-fr31 -; "callee save" registers: fr12-fr21 -; "special" registers: fr0-fr3 (status and exception registers) -; -; For the integer registers -; value zero : r0 -; "caller save" registers: r1,r19-r26 -; "callee save" registers: r3-r18 -; return register : r2 (rp) -; return values ; r28 (ret0,ret1) -; Stack pointer ; r30 (sp) -; global data pointer ; r27 (dp) -; argument pointer ; r29 (ap) -; millicode return ptr ; r31 (also a caller save register) - - -; -; Arguments to the routines -; -r_ptr .reg %r26 -a_ptr .reg %r25 -b_ptr .reg %r24 -num .reg %r24 -w .reg %r23 -n .reg %r23 - - -; -; Globals used in some routines -; - -top_overflow .reg %r29 -high_mask .reg %r22 ; value 0xffffffff80000000L - - -;------------------------------------------------------------------------------ -; -; bn_mul_add_words -; -;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, -; int num, BN_ULONG w) -; -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = num -; arg3 = w -; -; Local register definitions -; - -fm1 .reg %fr22 -fm .reg %fr23 -ht_temp .reg %fr24 -ht_temp_1 .reg %fr25 -lt_temp .reg %fr26 -lt_temp_1 .reg %fr27 -fm1_1 .reg %fr28 -fm_1 .reg %fr29 - -fw_h .reg %fr7L -fw_l .reg %fr7R -fw .reg %fr7 - -fht_0 .reg %fr8L -flt_0 .reg %fr8R -t_float_0 .reg %fr8 - -fht_1 .reg %fr9L -flt_1 .reg %fr9R -t_float_1 .reg %fr9 - -tmp_0 .reg %r31 -tmp_1 .reg %r21 -m_0 .reg %r20 -m_1 .reg %r19 -ht_0 .reg %r1 -ht_1 .reg %r3 -lt_0 .reg %r4 -lt_1 .reg %r5 -m1_0 .reg %r6 -m1_1 .reg %r7 -rp_val .reg %r8 -rp_val_1 .reg %r9 - -bn_mul_add_words - .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN - .proc - .callinfo frame=128 - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP ; Needed to make the loop 16-byte aligned - NOP ; Needed to make the loop 16-byte aligned - - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - STD %r8,40(%sp) ; save r8 - - STD %r9,48(%sp) ; save r9 - COPY %r0,%ret0 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - STD w,56(%sp) ; store w on stack - - CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit - LDO 128(%sp),%sp ; bump stack - - ; - ; The loop is unrolled twice, so if there is only 1 number - ; then go straight to the cleanup code. 
- ; - CMPIB,= 1,num,bn_mul_add_words_single_top - FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit mutiplies can be issued per cycle. - ; -bn_mul_add_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDD 8(r_ptr),rp_val_1 ; rp[1] - - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1[0] - FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] - - XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h - XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m[0] - FSTD fm_1,-40(%sp) ; -40(sp) = m[1] - - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 - - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 - - LDD -8(%sp),m_0 ; m[0] - LDD -40(%sp),m_1 ; m[1] - LDD -16(%sp),m1_0 ; m1[0] - LDD -48(%sp),m1_1 ; m1[1] - - LDD -24(%sp),ht_0 ; ht[0] - LDD -56(%sp),ht_1 ; ht[1] - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; - - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) - ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) - - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) - ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) - EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 - - EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 - ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) - ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) - - ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - - ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - - LDO -2(num),num ; num = num - 2; - ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - - ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] - ADD,DC ht_1,%r0,%ret0 ; ht[1]++ - LDO 16(a_ptr),a_ptr ; a_ptr += 2 - - STD lt_1,8(r_ptr) ; rp[1] = lt[1] - CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do - LDO 16(r_ptr),r_ptr ; r_ptr += 2 - - CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_add_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDO 8(a_ptr),a_ptr ; a_ptr++ - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 ; m1 = temp1 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - 
- EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD %ret0,tmp_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] - ADD,DC ht_0,%r0,%ret0 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_add_words_exit - .EXIT - LDD -80(%sp),%r9 ; restore r9 - LDD -88(%sp),%r8 ; restore r8 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; arg3 = w - -bn_mul_words - .proc - .callinfo frame=128 - .entry - .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - STD %r7,32(%sp) ; save r7 - COPY %r0,%ret0 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - STD w,56(%sp) ; w on stack - - CMPIB,>= 0,num,bn_mul_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; See if only 1 word to do, thus just do cleanup - ; - CMPIB,= 1,num,bn_mul_words_single_top - FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit mutiplies can be issued per cycle. - ; -bn_mul_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l - - FSTD fm1,-16(%sp) ; -16(sp) = m1 - FSTD fm1_1,-48(%sp) ; -48(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h - - FSTD fm,-8(%sp) ; -8(sp) = m - FSTD fm_1,-40(%sp) ; -40(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h - - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt - LDD -8(%sp),m_0 - LDD -40(%sp),m_1 - - LDD -16(%sp),m1_0 - LDD -48(%sp),m1_1 - LDD -24(%sp),ht_0 - LDD -56(%sp),ht_1 - - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) - ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - EXTRD,U tmp_1,31,32,m_1 ; m>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD lt_1,m1_1,lt_1 ; lt = lt+m1; - ADD,DC ht_1,%r0,ht_1 ; ht++ - ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0); - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) - ADD,DC ht_1,%r0,ht_1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - STD lt_1,8(r_ptr) ; rp[1] = lt - - COPY ht_1,%ret0 ; carry = ht - LDO 
-2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_mul_words_unroll2 - LDO 16(r_ptr),r_ptr ; rp++ - - CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt= lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD %ret0,lt_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - COPY ht_0,%ret0 ; copy carry - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_words_exit - .EXIT - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; - -bn_sqr_words - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - CMPIB,>= 0,num,bn_sqr_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; If only 1, the goto straight to cleanup - ; - CMPIB,= 1,num,bn_sqr_words_single_top - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - -bn_sqr_words_unroll2 - FLDD 0(a_ptr),t_float_0 ; a[0] - FLDD 8(a_ptr),t_float_1 ; a[1] - XMPYU fht_0,flt_0,fm ; m[0] - XMPYU fht_1,flt_1,fm_1 ; m[1] - - FSTD fm,-24(%sp) ; store m[0] - FSTD fm_1,-56(%sp) ; store m[1] - XMPYU flt_0,flt_0,lt_temp ; lt[0] - XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] - - FSTD lt_temp,-16(%sp) ; store lt[0] - FSTD lt_temp_1,-48(%sp) ; store lt[1] - XMPYU fht_0,fht_0,ht_temp ; ht[0] - XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] - - FSTD ht_temp,-8(%sp) ; store ht[0] - FSTD ht_temp_1,-40(%sp) ; store ht[1] - LDD -24(%sp),m_0 - LDD -56(%sp),m_1 - - AND m_0,high_mask,tmp_0 ; m[0] & Mask - AND m_1,high_mask,tmp_1 ; m[1] & Mask - DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 - DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 - - LDD -16(%sp),lt_0 - LDD -48(%sp),lt_1 - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 - EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 - - LDD -8(%sp),ht_0 - LDD -40(%sp),ht_1 - ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 - ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 - - ADD lt_0,m_0,lt_0 ; lt = lt+m - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - STD ht_0,8(r_ptr) ; rp[1] = ht[1] - - ADD lt_1,m_1,lt_1 ; lt = lt+m - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_1,16(r_ptr) ; rp[2] = lt[1] - STD ht_1,24(r_ptr) ; rp[3] = ht[1] - - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_sqr_words_unroll2 - LDO 
32(r_ptr),r_ptr ; rp += 4 - - CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_sqr_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,flt_0,fm ; m - FSTD fm,-24(%sp) ; store m - - XMPYU flt_0,flt_0,lt_temp ; lt - FSTD lt_temp,-16(%sp) ; store lt - - XMPYU fht_0,fht_0,ht_temp ; ht - FSTD ht_temp,-8(%sp) ; store ht - - LDD -24(%sp),m_0 ; load m - AND m_0,high_mask,tmp_0 ; m & Mask - DEPD,Z m_0,30,31,m_0 ; m << 32+1 - LDD -16(%sp),lt_0 ; lt - - LDD -8(%sp),ht_0 ; ht - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 - ADD m_0,lt_0,lt_0 ; lt = lt+m - ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 - ADD,DC ht_0,%r0,ht_0 ; ht++ - - STD lt_0,0(r_ptr) ; rp[0] = lt - STD ht_0,8(r_ptr) ; rp[1] = ht - -bn_sqr_words_exit - .EXIT - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - .PROCEND ;in=23,24,25,26,29;out=28; - - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t .reg %r22 -b .reg %r21 -l .reg %r20 - -bn_add_words - .proc - .entry - .callinfo - .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - CMPIB,>= 0,n,bn_add_words_exit - COPY %r0,%ret0 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_add_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_add_words_unroll2 - LDD 0(a_ptr),t - LDD 0(b_ptr),b - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,0(r_ptr) - - LDD 8(a_ptr),t - LDD 8(b_ptr),b - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_add_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_add_words_exit ; are we done? - -bn_add_words_single_top - LDD 0(a_ptr),t - LDD 0(b_ptr),b - - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) 
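The carry handling in bn_add_words above reads more directly in C; the loop form and names below are illustrative, again assuming a 64-bit BN_ULONG.

#include <stdint.h>

typedef uint64_t BN_ULONG;

static BN_ULONG add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
    BN_ULONG c = 0;                       /* %ret0 */
    for (int i = 0; i < n; i++) {
        BN_ULONG t  = a[i] + c;           /* ADD t,%ret0,t        */
        BN_ULONG c1 = (t < c);            /* ADD,DC %r0,%r0,%ret0 */
        r[i] = t + b[i];                  /* ADD t,b,l            */
        c = c1 + (r[i] < t);              /* ADD,DC %ret0,%r0,... */
    }
    return c;
}

The two carry-outs can never be set at the same time (a carry from the first add forces t to 0), so folding them with a plain add keeps c at 0 or 1.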
- ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,0(r_ptr) - -bn_add_words_exit - .EXIT - BVE (%rp) - NOP - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t1 .reg %r22 -t2 .reg %r21 -sub_tmp1 .reg %r20 -sub_tmp2 .reg %r19 - - -bn_sub_words - .proc - .callinfo - .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - CMPIB,>= 0,n,bn_sub_words_exit - COPY %r0,%ret0 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_sub_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_sub_words_unroll2 - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - STD sub_tmp1,0(r_ptr) - - LDD 8(a_ptr),t1 - LDD 8(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - STD sub_tmp1,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_sub_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? - -bn_sub_words_single_top - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - - STD sub_tmp1,0(r_ptr) - -bn_sub_words_exit - .EXIT - BVE (%rp) - NOP - .PROCEND ;in=23,24,25,26,29;out=28; - -;------------------------------------------------------------------------------ -; -; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) -; -; arg0 = h -; arg1 = l -; arg2 = d -; -; This is mainly just modified assembly from the compiler, thus the -; lack of variable names. 
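Likewise, the CMPCLR pairs in bn_sub_words above implement the usual borrow rule: the new borrow is 1 when a[i] < b[i], 0 when a[i] > b[i], and unchanged when the words are equal. An approximate C rendering (names illustrative):

#include <stdint.h>

typedef uint64_t BN_ULONG;

static BN_ULONG sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
    BN_ULONG c = 0;                       /* borrow, 0 or 1 */
    for (int i = 0; i < n; i++) {
        r[i] = a[i] - b[i] - c;           /* SUB t1,t2,... ; SUB ...,%ret0              */
        if (a[i] != b[i])                 /* CMPCLR,*= keeps the old borrow on equality */
            c = (a[i] < b[i]);            /* CMPCLR,*>> / LDO 1(%r0)                    */
    }
    return c;
}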
-; -;------------------------------------------------------------------------------ -bn_div_words - .proc - .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .IMPORT BN_num_bits_word,CODE,NO_RELOCATION - .IMPORT __iob,DATA - .IMPORT fprintf,CODE,NO_RELOCATION - .IMPORT abort,CODE,NO_RELOCATION - .IMPORT $$div2U,MILLICODE - .entry - STD %r2,-16(%r30) - STD,MA %r3,352(%r30) - STD %r4,-344(%r30) - STD %r5,-336(%r30) - STD %r6,-328(%r30) - STD %r7,-320(%r30) - STD %r8,-312(%r30) - STD %r9,-304(%r30) - STD %r10,-296(%r30) - - STD %r27,-288(%r30) ; save gp - - COPY %r24,%r3 ; save d - COPY %r26,%r4 ; save h (high 64-bits) - LDO -1(%r0),%ret0 ; return -1 by default - - CMPB,*= %r0,%arg2,$D3 ; if (d == 0) - COPY %r25,%r5 ; save l (low 64-bits) - - LDO -48(%r30),%r29 ; create ap - .CALL ;in=26,29;out=28; - B,L BN_num_bits_word,%r2 - COPY %r3,%r26 - LDD -288(%r30),%r27 ; restore gp - LDI 64,%r21 - - CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) - COPY %ret0,%r24 ; i - MTSARCM %r24 - DEPDI,Z -1,%sar,1,%r29 - CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<= d) - SUB %r4,%r3,%r4 ; h -= d - CMPB,= %r31,%r0,$0000001A ; if (i) - COPY %r0,%r10 ; ret = 0 - MTSARCM %r31 ; i to shift - DEPD,Z %r3,%sar,64,%r3 ; d <<= i; - SUBI 64,%r31,%r19 ; 64 - i; redundent - MTSAR %r19 ; (64 -i) to shift - SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i) - MTSARCM %r31 ; i to shift - DEPD,Z %r5,%sar,64,%r5 ; l <<= i; - -$0000001A - DEPDI,Z -1,31,32,%r19 - EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32 - EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff - LDO 2(%r0),%r9 - STD %r3,-280(%r30) ; "d" to stack - -$0000001C - DEPDI,Z -1,63,32,%r29 ; - EXTRD,U %r4,31,32,%r31 ; h >> 32 - CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div - COPY %r4,%r26 - EXTRD,U %r4,31,32,%r25 - COPY %r6,%r24 - .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) - B,L $$div2U,%r2 - EXTRD,U %r6,31,32,%r23 - DEPD %r28,31,32,%r29 -$D2 - STD %r29,-272(%r30) ; q - AND %r5,%r19,%r24 ; t & 0xffffffff00000000; - EXTRD,U %r24,31,32,%r24 ; ??? 
- FLDD -272(%r30),%fr7 ; q - FLDD -280(%r30),%fr8 ; d - XMPYU %fr8L,%fr7L,%fr10 - FSTD %fr10,-256(%r30) - XMPYU %fr8L,%fr7R,%fr22 - FSTD %fr22,-264(%r30) - XMPYU %fr8R,%fr7L,%fr11 - XMPYU %fr8R,%fr7R,%fr23 - FSTD %fr11,-232(%r30) - FSTD %fr23,-240(%r30) - LDD -256(%r30),%r28 - DEPD,Z %r28,31,32,%r2 - LDD -264(%r30),%r20 - ADD,L %r20,%r2,%r31 - LDD -232(%r30),%r22 - DEPD,Z %r22,31,32,%r22 - LDD -240(%r30),%r21 - B $00000024 ; enter loop - ADD,L %r21,%r22,%r23 - -$0000002A - LDO -1(%r29),%r29 - SUB %r23,%r8,%r23 -$00000024 - SUB %r4,%r31,%r25 - AND %r25,%r19,%r26 - CMPB,*<>,N %r0,%r26,$00000046 ; (forward) - DEPD,Z %r25,31,32,%r20 - OR %r20,%r24,%r21 - CMPB,*<<,N %r21,%r23,$0000002A ;(backward) - SUB %r31,%r6,%r31 -;-------------Break path--------------------- - -$00000046 - DEPD,Z %r23,31,32,%r25 ;tl - EXTRD,U %r23,31,32,%r26 ;t - AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L - ADD,L %r31,%r26,%r31 ;th += t; - CMPCLR,*>>= %r5,%r24,%r0 ;if (l>32)); - DEPD,Z %r29,31,32,%r10 ; ret = q<<32 - b $0000001C - DEPD,Z %r28,31,32,%r5 ; l = l << 32 - -$D1 - OR %r10,%r29,%r28 ; ret |= q -$D3 - LDD -368(%r30),%r2 -$D0 - LDD -296(%r30),%r10 - LDD -304(%r30),%r9 - LDD -312(%r30),%r8 - LDD -320(%r30),%r7 - LDD -328(%r30),%r6 - LDD -336(%r30),%r5 - LDD -344(%r30),%r4 - BVE (%r2) - .EXIT - LDD,MB -352(%r30),%r3 - -bn_div_err_case - MFIA %r6 - ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 - LDO R'bn_div_words-bn_div_err_case(%r1),%r6 - ADDIL LT'__iob,%r27,%r1 - LDD RT'__iob(%r1),%r26 - ADDIL L'C$4-bn_div_words,%r6,%r1 - LDO R'C$4-bn_div_words(%r1),%r25 - LDO 64(%r26),%r26 - .CALL ;in=24,25,26,29;out=28; - B,L fprintf,%r2 - LDO -48(%r30),%r29 - LDD -288(%r30),%r27 - .CALL ;in=29; - B,L abort,%r2 - LDO -48(%r30),%r29 - LDD -288(%r30),%r27 - B $D0 - LDD -368(%r30),%r2 - .PROCEND ;in=24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -; Registers to hold 64-bit values to manipulate. The "L" part -; of the register corresponds to the upper 32-bits, while the "R" -; part corresponds to the lower 32-bits -; -; Note, that when using b6 and b7, the code must save these before -; using them because they are callee save registers -; -; -; Floating point registers to use to save values that -; are manipulated. These don't collide with ftemp1-6 and -; are all caller save registers -; -a0 .reg %fr22 -a0L .reg %fr22L -a0R .reg %fr22R - -a1 .reg %fr23 -a1L .reg %fr23L -a1R .reg %fr23R - -a2 .reg %fr24 -a2L .reg %fr24L -a2R .reg %fr24R - -a3 .reg %fr25 -a3L .reg %fr25L -a3R .reg %fr25R - -a4 .reg %fr26 -a4L .reg %fr26L -a4R .reg %fr26R - -a5 .reg %fr27 -a5L .reg %fr27L -a5R .reg %fr27R - -a6 .reg %fr28 -a6L .reg %fr28L -a6R .reg %fr28R - -a7 .reg %fr29 -a7L .reg %fr29L -a7R .reg %fr29R - -b0 .reg %fr30 -b0L .reg %fr30L -b0R .reg %fr30R - -b1 .reg %fr31 -b1L .reg %fr31L -b1R .reg %fr31R - -; -; Temporary floating point variables, these are all caller save -; registers -; -ftemp1 .reg %fr4 -ftemp2 .reg %fr5 -ftemp3 .reg %fr6 -ftemp4 .reg %fr7 - -; -; The B set of registers when used. 
-; - -b2 .reg %fr8 -b2L .reg %fr8L -b2R .reg %fr8R - -b3 .reg %fr9 -b3L .reg %fr9L -b3R .reg %fr9R - -b4 .reg %fr10 -b4L .reg %fr10L -b4R .reg %fr10R - -b5 .reg %fr11 -b5L .reg %fr11L -b5R .reg %fr11R - -b6 .reg %fr12 -b6L .reg %fr12L -b6R .reg %fr12R - -b7 .reg %fr13 -b7L .reg %fr13L -b7R .reg %fr13R - -c1 .reg %r21 ; only reg -temp1 .reg %r20 ; only reg -temp2 .reg %r19 ; only reg -temp3 .reg %r31 ; only reg - -m1 .reg %r28 -c2 .reg %r23 -high_one .reg %r1 -ht .reg %r6 -lt .reg %r5 -m .reg %r4 -c3 .reg %r3 - -SQR_ADD_C .macro A0L,A0R,C1,C2,C3 - XMPYU A0L,A0R,ftemp1 ; m - FSTD ftemp1,-24(%sp) ; store m - - XMPYU A0R,A0R,ftemp2 ; lt - FSTD ftemp2,-16(%sp) ; store lt - - XMPYU A0L,A0L,ftemp3 ; ht - FSTD ftemp3,-8(%sp) ; store ht - - LDD -24(%sp),m ; load m - AND m,high_mask,temp2 ; m & Mask - DEPD,Z m,30,31,temp3 ; m << 32+1 - LDD -16(%sp),lt ; lt - - LDD -8(%sp),ht ; ht - EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 - ADD temp3,lt,lt ; lt = lt+m - ADD,L ht,temp1,ht ; ht += temp1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; ht++ - - ADD C2,ht,C2 ; c2=c2+ht - ADD,DC C3,%r0,C3 ; c3++ -.endm - -SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 - XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU A0R,A1L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,A1R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,A1L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD ht,ht,ht ; ht=ht+ht; - ADD,DC C3,%r0,C3 ; add in carry (c3++) - - ADD lt,lt,lt ; lt=lt+lt; - ADD,DC ht,%r0,ht ; add in carry (ht++) - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) - LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - -; -;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba8 - .PROC - .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .ENTRY - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 - 
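The SQR_ADD_C/SQR_ADD_C2 macros above (and MUL_ADD_C further below) all fold one 128-bit product into a three-word column accumulator c1/c2/c3. A compact C model, using a 128-bit type where the assembly uses the four-XMPYU split shown earlier (helper names illustrative):

#include <stdint.h>

typedef uint64_t BN_ULONG;
typedef unsigned __int128 u128;            /* stands in for the XMPYU half-word products */

static void mul_add_c(BN_ULONG a, BN_ULONG b,
                      BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
{
    u128 t = (u128)a * b;
    BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
    *c1 += lo;  hi  += (*c1 < lo);         /* ADD C1,lt,C1 ; ADD,DC ht */
    *c2 += hi;  *c3 += (*c2 < hi);         /* ADD C2,ht,C2 ; ADD,DC C3 */
}

static void sqr_add_c2(BN_ULONG a, BN_ULONG b,
                       BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
{
    /* off-diagonal terms of a square occur twice (a*b and b*a); the macro
       doubles lt/ht in place, accumulating the product twice is equivalent */
    mul_add_c(a, b, c1, c2, c3);
    mul_add_c(a, b, c1, c2, c3);
}

Each STD/COPY pair in bn_sqr_comba8 then closes one column: store c1 (or c2, c3) as the next result word and clear it before the next diagonal.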
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 - STD c2,56(r_ptr) ; r[7] = c2; - COPY %r0,c2 - - SQR_ADD_C a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 - SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 - STD c3,64(r_ptr) ; r[8] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 - STD c1,72(r_ptr) ; r[9] = c1; - COPY %r0,c1 - - SQR_ADD_C a5L,a5R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 - SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 - STD c2,80(r_ptr) ; r[10] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 - STD c3,88(r_ptr) ; r[11] = c3; - COPY %r0,c3 - - SQR_ADD_C a6L,a6R,c1,c2,c3 - SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 - STD c1,96(r_ptr) ; r[12] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 - STD c2,104(r_ptr) ; r[13] = c2; - COPY %r0,c2 - - SQR_ADD_C a7L,a7R,c3,c1,c2 - STD c3, 112(r_ptr) ; r[14] = c3 - STD c1, 120(r_ptr) ; r[15] = c1 - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - STD c2,56(r_ptr) ; r[7] = c2; - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--------------------------------------------------------------------------- - -MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 - XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,B0R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,B0L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - - -; -;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba8 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - FLDD 32(b_ptr),b4 - FLDD 40(b_ptr),b5 - FLDD 48(b_ptr),b6 - FLDD 56(b_ptr),b7 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 - STD c1,48(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 - STD c2,56(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 - 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 - STD c3,64(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 - STD c1,72(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 - STD c2,80(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 - STD c3,88(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 - STD c1,96(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 - STD c2,104(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 - STD c3,112(r_ptr) - STD c1,120(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - STD c1,48(r_ptr) - STD c2,56(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - - .SPACE $TEXT$ - .SUBSPA $CODE$ - .SPACE $PRIVATE$,SORT=16 - .IMPORT $global$,DATA - .SPACE $TEXT$ - .SUBSPA $CODE$ - .SUBSPA $LIT$,ACCESS=0x2c -C$4 - .ALIGN 8 - .STRINGZ "Division would overflow (%d)\n" - .END diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.S b/main/openssl/crypto/sha/asm/sha1-armv4-large.S new 
file mode 100644 index 00000000..a1562883 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.S @@ -0,0 +1,1450 @@ +#include "arm_arch.h" + +.text +.code 32 + +.global sha1_block_data_order +.type sha1_block_data_order,%function + +.align 5 +sha1_block_data_order: +#if __ARM_ARCH__>=7 + sub r3,pc,#8 @ sha1_block_data_order + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
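In C terms the capability test at the top of sha1_block_data_order is plain runtime dispatch on OPENSSL_armcap_P. The sketch below is illustrative only: the real flag values live in arm_arch.h, the name sha1_block_data_order_c stands for the integer path that the assembly simply falls through to, and the prototype is simplified.

#include <stddef.h>
#include <stdint.h>

#define ARMV7_NEON  (1 << 0)    /* illustrative bit positions; see arm_arch.h */
#define ARMV8_SHA1  (1 << 3)

extern unsigned int OPENSSL_armcap_P;

void sha1_block_data_order_c(uint32_t *ctx, const void *in, size_t num);
void sha1_block_data_order_neon(uint32_t *ctx, const void *in, size_t num);
void sha1_block_data_order_armv8(uint32_t *ctx, const void *in, size_t num);

void sha1_block_data_order(uint32_t *ctx, const void *in, size_t num)
{
    if (OPENSSL_armcap_P & ARMV8_SHA1)          /* tst r12,#ARMV8_SHA1 ; bne .LARMv8 */
        sha1_block_data_order_armv8(ctx, in, num);
    else if (OPENSSL_armcap_P & ARMV7_NEON)     /* tst r12,#ARMV7_NEON ; bne .LNEON  */
        sha1_block_data_order_neon(ctx, in, num);
    else
        sha1_block_data_order_c(ctx, in, num);  /* the scalar rounds below */
}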
+ add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + teq r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
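Every unrolled block above has the same round shape; only the F function and the constant change. A plain C rendering of one 0..19 round and of the schedule word used from round 16 on (helper names illustrative; the working values are kept pre-rotated, which is what the ror#2/ror#30 suffixes account for):

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

static void sha1_round_00_19(uint32_t *a, uint32_t *b, uint32_t c, uint32_t d,
                             uint32_t *e, uint32_t x)
{
    const uint32_t K_00_19 = 0x5a827999;         /* .LK_00_19                   */
    uint32_t f = (*b & (c ^ d)) ^ d;             /* the and/eor pair, F_00_19   */
    *e += rol32(*a, 5) + f + K_00_19 + x;        /* "E+=ROR(A,27)" == ROL(A,5)  */
    *b = rol32(*b, 30);                          /* "mov ...,ror#2"             */
}

static uint32_t sha1_schedule(uint32_t x[16], int i)          /* i >= 16 */
{
    uint32_t t = x[(i-3)&15] ^ x[(i-8)&15] ^ x[(i-14)&15] ^ x[i&15];
    return x[i&15] = rol32(t, 1);                /* the eor/eor/ror#31 sequence */
}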
+ eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + teq r14,sp @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
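For the later rounds only the F function changes: 20..39 and 60..79 use the parity function, and 40..59 uses the majority function, which the block above evaluates as two AND terms that are added rather than ORed (the terms can never share a set bit). In C (names illustrative):

#include <stdint.h>

static uint32_t f_20_39(uint32_t b, uint32_t c, uint32_t d)   /* also rounds 60..79 */
{
    return b ^ c ^ d;                     /* the eor/eor pair                 */
}

static uint32_t f_40_59(uint32_t b, uint32_t c, uint32_t d)
{
    return (b & (c ^ d)) + (c & d);       /* and r10 / and r11, then two adds */
}

The 20..39 and 60..79 rounds share one code path; the carry flag set up by the cmn/cmp on sp before entering .L_20_39_or_60_79 decides which exit is taken when the counter runs out.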
+ and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + teq r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha1_block_data_order,.-sha1_block_data_order + +.align 5 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha1_block_data_order +.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 5 +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub sp,sp,#64 @ alloca + adr r8,.LK_00_19 + bic sp,sp,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov r12,sp + + vld1.8 {q0-q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2-q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! + ldr r9,[sp] @ big RAW stall + +.Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! 
+ add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + subeq r1,r1,#64 + vld1.8 {q0-q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2-q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! 
+ add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + moveq sp,r14 + add r7,r7,r9 + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + addne r12,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4-r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type sha1_block_data_order_armv8,%function +.align 5 +sha1_block_data_order_armv8: +.LARMv8: + vstmdb sp!,{d8-d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,.LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +.Loop_v8: + vld1.8 {q4-q5},[r1]! + vld1.8 {q6-q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 0 + .byte 0x68,0x0c,0x02,0xf2 @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 1 + .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 2 + .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 3 + .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 4 + .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 5 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 6 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 7 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 8 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 9 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 10 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 11 + .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 12 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 13 + .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 + .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 14 + .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 + .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 15 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 + .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 16 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 + .byte 
0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 17 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 18 + .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + + .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 19 + .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne .Loop_v8 + + vst1.32 {q0},[r0]! + vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8-d15} + bx lr @ bx lr +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +.comm OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.s b/main/openssl/crypto/sha/asm/sha1-armv4-large.s deleted file mode 100644 index a1562883..00000000 --- a/main/openssl/crypto/sha/asm/sha1-armv4-large.s +++ /dev/null @@ -1,1450 +0,0 @@ -#include "arm_arch.h" - -.text -.code 32 - -.global sha1_block_data_order -.type sha1_block_data_order,%function - -.align 5 -sha1_block_data_order: -#if __ARM_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#ARMV8_SHA1 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - stmdb sp!,{r4-r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -.Lloop: - ldr r8,.LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -.L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) - teq r14,sp - bne .L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,.LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -.L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) - teq r14,sp @ preserve carry - bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,.LK_40_59 - sub sp,sp,#20*4 @ [+2] -.L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 - teq r14,sp - bne .L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,.LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b .L_20_39_or_60_79 @ [+4], spare 300 bytes -.L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne .Lloop @ [+18], total 1307 - -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size sha1_block_data_order,.-sha1_block_data_order - -.align 5 -.LK_00_19: .word 0x5a827999 -.LK_20_39: .word 0x6ed9eba1 -.LK_40_59: .word 0x8f1bbcdc -.LK_60_79: .word 0xca62c1d6 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order -.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " -.align 5 -#if __ARM_ARCH__>=7 -.fpu neon - -.type sha1_block_data_order_neon,%function -.align 4 -sha1_block_data_order_neon: -.LNEON: - stmdb sp!,{r4-r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - @ dmb @ errata #451034 on early Cortex A8 - @ vstmdb sp!,{d8-d15} @ ABI specification says so - mov r14,sp - sub sp,sp,#64 @ alloca - adr r8,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores - - ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov r12,sp - - vld1.8 {q0-q1},[r1]! @ handles unaligned - veor q15,q15,q15 - vld1.8 {q2-q3},[r1]! - vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 - vrev32.8 q0,q0 @ yes, even on - vrev32.8 q1,q1 @ big-endian... - vrev32.8 q2,q2 - vadd.i32 q8,q0,q14 - vrev32.8 q3,q3 - vadd.i32 q9,q1,q14 - vst1.32 {q8},[r12,:128]! - vadd.i32 q10,q2,q14 - vst1.32 {q9},[r12,:128]! - vst1.32 {q10},[r12,:128]! 
- ldr r9,[sp] @ big RAW stall - -.Loop_neon: - vext.8 q8,q0,q1,#8 - bic r10,r6,r4 - add r7,r7,r9 - and r11,r5,r4 - vadd.i32 q13,q3,q14 - ldr r9,[sp,#4] - add r7,r7,r3,ror#27 - vext.8 q12,q3,q15,#4 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q8,q8,q0 - bic r10,r5,r3 - add r6,r6,r9 - veor q12,q12,q2 - and r11,r4,r3 - ldr r9,[sp,#8] - veor q12,q12,q8 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q13,q15,q12,#4 - bic r10,r4,r7 - add r5,r5,r9 - vadd.i32 q8,q12,q12 - and r11,r3,r7 - ldr r9,[sp,#12] - vsri.32 q8,q12,#31 - add r5,r5,r6,ror#27 - eor r11,r11,r10 - mov r7,r7,ror#2 - vshr.u32 q12,q13,#30 - add r5,r5,r11 - bic r10,r3,r6 - vshl.u32 q13,q13,#2 - add r4,r4,r9 - and r11,r7,r6 - veor q8,q8,q12 - ldr r9,[sp,#16] - add r4,r4,r5,ror#27 - veor q8,q8,q13 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q9,q1,q2,#8 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - vadd.i32 q13,q8,q14 - ldr r9,[sp,#20] - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r4,ror#27 - vext.8 q12,q8,q15,#4 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - veor q9,q9,q1 - bic r10,r6,r4 - add r7,r7,r9 - veor q12,q12,q3 - and r11,r5,r4 - ldr r9,[sp,#24] - veor q12,q12,q9 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q13,q15,q12,#4 - bic r10,r5,r3 - add r6,r6,r9 - vadd.i32 q9,q12,q12 - and r11,r4,r3 - ldr r9,[sp,#28] - vsri.32 q9,q12,#31 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - mov r3,r3,ror#2 - vshr.u32 q12,q13,#30 - add r6,r6,r11 - bic r10,r4,r7 - vshl.u32 q13,q13,#2 - add r5,r5,r9 - and r11,r3,r7 - veor q9,q9,q12 - ldr r9,[sp,#32] - add r5,r5,r6,ror#27 - veor q9,q9,q13 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q10,q2,q3,#8 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - vadd.i32 q13,q9,q14 - ldr r9,[sp,#36] - add r4,r4,r5,ror#27 - vext.8 q12,q9,q15,#4 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q10,q10,q2 - bic r10,r7,r5 - add r3,r3,r9 - veor q12,q12,q8 - and r11,r6,r5 - ldr r9,[sp,#40] - veor q12,q12,q10 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q13,q15,q12,#4 - bic r10,r6,r4 - add r7,r7,r9 - vadd.i32 q10,q12,q12 - and r11,r5,r4 - ldr r9,[sp,#44] - vsri.32 q10,q12,#31 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - mov r4,r4,ror#2 - vshr.u32 q12,q13,#30 - add r7,r7,r11 - bic r10,r5,r3 - vshl.u32 q13,q13,#2 - add r6,r6,r9 - and r11,r4,r3 - veor q10,q10,q12 - ldr r9,[sp,#48] - add r6,r6,r7,ror#27 - veor q10,q10,q13 - eor r11,r11,r10 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q11,q3,q8,#8 - bic r10,r4,r7 - add r5,r5,r9 - and r11,r3,r7 - vadd.i32 q13,q10,q14 - ldr r9,[sp,#52] - add r5,r5,r6,ror#27 - vext.8 q12,q10,q15,#4 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q11,q11,q3 - bic r10,r3,r6 - add r4,r4,r9 - veor q12,q12,q9 - and r11,r7,r6 - ldr r9,[sp,#56] - veor q12,q12,q11 - add r4,r4,r5,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q13,q15,q12,#4 - bic r10,r7,r5 - add r3,r3,r9 - vadd.i32 q11,q12,q12 - and r11,r6,r5 - ldr r9,[sp,#60] - vsri.32 q11,q12,#31 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - vshr.u32 q12,q13,#30 - add r3,r3,r11 - bic r10,r6,r4 - vshl.u32 q13,q13,#2 - add r7,r7,r9 - and r11,r5,r4 - veor q11,q11,q12 - ldr r9,[sp,#0] - add r7,r7,r3,ror#27 - veor q11,q11,q13 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q10,q11,#8 - bic r10,r5,r3 - add r6,r6,r9 - and r11,r4,r3 - veor q0,q0,q8 - ldr r9,[sp,#4] - add r6,r6,r7,ror#27 - veor q0,q0,q1 - eor r11,r11,r10 - mov r3,r3,ror#2 - vadd.i32 q13,q11,q14 - add r6,r6,r11 - bic r10,r4,r7 - veor q12,q12,q0 - add r5,r5,r9 - and r11,r3,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r11,r10 - mov r7,r7,ror#2 - vsli.32 q0,q12,#2 - add r5,r5,r11 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - ldr r9,[sp,#12] - add r4,r4,r5,ror#27 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - ldr r9,[sp,#16] - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q11,q0,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#20] - veor q1,q1,q9 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q1,q1,q2 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q0,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q1 - ldr r9,[sp,#24] - eor r11,r10,r4 - vshr.u32 q1,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q1,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#28] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#32] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q0,q1,#8 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#36] - veor q2,q2,q10 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - veor q2,q2,q3 - mov r5,r5,ror#2 - add r3,r3,r11 - vadd.i32 q13,q1,q14 - eor r10,r4,r6 - vld1.32 {d28[],d29[]},[r8,:32]! - add r7,r7,r9 - veor q12,q12,q2 - ldr r9,[sp,#40] - eor r11,r10,r5 - vshr.u32 q2,q12,#30 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - vst1.32 {q13},[r12,:128]! - add r7,r7,r11 - eor r10,r3,r5 - vsli.32 q2,q12,#2 - add r6,r6,r9 - ldr r9,[sp,#44] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#48] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q1,q2,#8 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r7 - add r4,r4,r5,ror#27 - veor q3,q3,q8 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q2,q14 - eor r10,r5,r7 - add r3,r3,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r6 - vshr.u32 q3,q12,#30 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r3,r3,r11 - eor r10,r4,r6 - vsli.32 q3,q12,#2 - add r7,r7,r9 - ldr r9,[sp,#60] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#0] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q2,q3,#8 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#4] - veor q8,q8,q0 - eor r11,r10,r3 - add r5,r5,r6,ror#27 - veor q8,q8,q9 - mov r7,r7,ror#2 - add r5,r5,r11 - vadd.i32 q13,q3,q14 - eor r10,r6,r3 - add r4,r4,r9 - veor q12,q12,q8 - ldr r9,[sp,#8] - eor r11,r10,r7 - vshr.u32 q8,q12,#30 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - add r4,r4,r11 - eor r10,r5,r7 - vsli.32 q8,q12,#2 - add r3,r3,r9 - ldr r9,[sp,#12] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#16] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q3,q8,#8 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#20] - veor q9,q9,q1 - eor r11,r10,r4 - add r6,r6,r7,ror#27 - veor q9,q9,q10 - mov r3,r3,ror#2 - add r6,r6,r11 - vadd.i32 q13,q8,q14 - eor r10,r7,r4 - add r5,r5,r9 - veor q12,q12,q9 - ldr r9,[sp,#24] - eor r11,r10,r3 - vshr.u32 q9,q12,#30 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - vst1.32 {q13},[r12,:128]! - add r5,r5,r11 - eor r10,r6,r3 - vsli.32 q9,q12,#2 - add r4,r4,r9 - ldr r9,[sp,#28] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#32] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q8,q9,#8 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#36] - veor q10,q10,q2 - add r7,r7,r3,ror#27 - eor r11,r5,r6 - veor q10,q10,q11 - add r7,r7,r10 - and r11,r11,r4 - vadd.i32 q13,q9,q14 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q12,q12,q10 - add r6,r6,r9 - and r10,r4,r5 - vshr.u32 q10,q12,#30 - ldr r9,[sp,#40] - add r6,r6,r7,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r4,r5 - add r6,r6,r10 - vsli.32 q10,q12,#2 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#44] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#48] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q9,q10,#8 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#52] - veor q11,q11,q3 - add r3,r3,r4,ror#27 - eor r11,r6,r7 - veor q11,q11,q0 - add r3,r3,r10 - and r11,r11,r5 - vadd.i32 q13,q10,q14 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - veor q12,q12,q11 - add r7,r7,r9 - and r10,r5,r6 - vshr.u32 q11,q12,#30 - ldr r9,[sp,#56] - add r7,r7,r3,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r5,r6 - add r7,r7,r10 - vsli.32 q11,q12,#2 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#60] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#0] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q10,q11,#8 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#4] - veor q0,q0,q8 - add r4,r4,r5,ror#27 - eor r11,r7,r3 - veor q0,q0,q1 - add r4,r4,r10 - and r11,r11,r6 - vadd.i32 q13,q11,q14 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q12,q12,q0 - add r3,r3,r9 - and r10,r6,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r3,r3,r4,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r6,r7 - add r3,r3,r10 - vsli.32 q0,q12,#2 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#12] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#16] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q11,q0,#8 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#20] - veor q1,q1,q9 - add r5,r5,r6,ror#27 - eor r11,r3,r4 - veor q1,q1,q2 - add r5,r5,r10 - and r11,r11,r7 - vadd.i32 q13,q0,q14 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q12,q12,q1 - add r4,r4,r9 - and r10,r7,r3 - vshr.u32 q1,q12,#30 - ldr r9,[sp,#24] - add r4,r4,r5,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r7,r3 - add r4,r4,r10 - vsli.32 q1,q12,#2 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#28] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#32] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q0,q1,#8 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#36] - veor q2,q2,q10 - add r6,r6,r7,ror#27 - eor r11,r4,r5 - veor q2,q2,q3 - add r6,r6,r10 - and r11,r11,r3 - vadd.i32 q13,q1,q14 - mov r3,r3,ror#2 - add r6,r6,r11 - veor q12,q12,q2 - add r5,r5,r9 - and r10,r3,r4 - vshr.u32 q2,q12,#30 - ldr r9,[sp,#40] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r3,r4 - add r5,r5,r10 - vsli.32 q2,q12,#2 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#44] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#48] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q1,q2,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q3,q3,q8 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q2,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r4 - vshr.u32 q3,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q3,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#60] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#0] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q3,q14 - eor r10,r5,r7 - add r3,r3,r9 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - teq r1,r2 - sub r8,r8,#16 - subeq r1,r1,#64 - vld1.8 {q0-q1},[r1]! - ldr r9,[sp,#4] - eor r11,r10,r6 - vld1.8 {q2-q3},[r1]! - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - eor r10,r4,r6 - vrev32.8 q0,q0 - add r7,r7,r9 - ldr r9,[sp,#8] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#12] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#16] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vrev32.8 q1,q1 - eor r10,r6,r3 - add r4,r4,r9 - vadd.i32 q8,q0,q14 - ldr r9,[sp,#20] - eor r11,r10,r7 - vst1.32 {q8},[r12,:128]! - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#24] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#28] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#32] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vrev32.8 q2,q2 - eor r10,r7,r4 - add r5,r5,r9 - vadd.i32 q9,q1,q14 - ldr r9,[sp,#36] - eor r11,r10,r3 - vst1.32 {q9},[r12,:128]! - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#40] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#44] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#48] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vrev32.8 q3,q3 - eor r10,r3,r5 - add r6,r6,r9 - vadd.i32 q10,q2,q14 - ldr r9,[sp,#52] - eor r11,r10,r4 - vst1.32 {q10},[r12,:128]! - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#56] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#60] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - ldmia r0,{r9,r10,r11,r12} @ accumulate context - add r3,r3,r9 - ldr r9,[r0,#16] - add r4,r4,r10 - add r5,r5,r11 - add r6,r6,r12 - moveq sp,r14 - add r7,r7,r9 - ldrne r9,[sp] - stmia r0,{r3,r4,r5,r6,r7} - addne r12,sp,#3*16 - bne .Loop_neon - - @ vldmia sp!,{d8-d15} - ldmia sp!,{r4-r12,pc} -.size sha1_block_data_order_neon,.-sha1_block_data_order_neon -#endif -#if __ARM_ARCH__>=7 -.type sha1_block_data_order_armv8,%function -.align 5 -sha1_block_data_order_armv8: -.LARMv8: - vstmdb sp!,{d8-d15} @ ABI specification says so - - veor q1,q1,q1 - adr r3,.LK_00_19 - vld1.32 {q0},[r0]! - vld1.32 {d2[0]},[r0] - sub r0,r0,#16 - vld1.32 {d16[],d17[]},[r3,:32]! - vld1.32 {d18[],d19[]},[r3,:32]! - vld1.32 {d20[],d21[]},[r3,:32]! - vld1.32 {d22[],d23[]},[r3,:32] - -.Loop_v8: - vld1.8 {q4-q5},[r1]! - vld1.8 {q6-q7},[r1]! 
- vrev32.8 q4,q4 - vrev32.8 q5,q5 - - vadd.i32 q12,q8,q4 - vrev32.8 q6,q6 - vmov q14,q0 @ offload - subs r2,r2,#1 - - vadd.i32 q13,q8,q5 - vrev32.8 q7,q7 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 0 - .byte 0x68,0x0c,0x02,0xf2 @ sha1c q0,q1,q12 - vadd.i32 q12,q8,q6 - .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 1 - .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 - vadd.i32 q13,q8,q7 - .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 - .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 2 - .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 - vadd.i32 q12,q8,q4 - .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 - .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 3 - .byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 - vadd.i32 q13,q9,q5 - .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 - .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 4 - .byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 - vadd.i32 q12,q9,q6 - .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 - .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 5 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q7 - .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 - .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 6 - .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 - vadd.i32 q12,q9,q4 - .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 - .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 7 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q5 - .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 - .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 8 - .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 - vadd.i32 q12,q10,q6 - .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 - .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 9 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - vadd.i32 q13,q10,q7 - .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 - .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 10 - .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q4 - .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 - .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 11 - .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 - vadd.i32 q13,q10,q5 - .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 - .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 12 - .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q6 - .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 - .byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 13 - .byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 - vadd.i32 q13,q11,q7 - .byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 - .byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 14 - .byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 - vadd.i32 q12,q11,q4 - .byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 - .byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 15 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q5 - .byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 - .byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 16 - .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 - vadd.i32 q12,q11,q6 - .byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 - .byte 
0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 17 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q7 - - .byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 18 - .byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 - - .byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 19 - .byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 - - vadd.i32 q1,q1,q2 - vadd.i32 q0,q0,q14 - bne .Loop_v8 - - vst1.32 {q0},[r0]! - vst1.32 {d2[0]},[r0] - - vldmia sp!,{d8-d15} - bx lr @ bx lr -.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 -#endif -.comm OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.S b/main/openssl/crypto/sha/asm/sha256-armv4.S new file mode 100644 index 00000000..853d7da5 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha256-armv4.S @@ -0,0 +1,2690 @@ +#include "arm_arch.h" + +.text +.code 32 + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: + sub r3,pc,#8 @ sha256_block_data_order + add r2,r1,r2,lsl#6 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) + rev r2,r2 +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +#if __ARM_ARCH__>=7 +.fpu neon + +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + mov r12,sp + sub sp,sp,#16*4+16 @ alloca + sub r14,r3,#256+32 @ K256 + bic sp,sp,#15 @ align for 128-bit stores + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! 
+ + ldmia r0,{r4-r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! 
+ add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8-r11} + + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#sha256_block_data_order-K256 + +.Loop_v8: + vld1.8 {q8-q9},[r1]! + vld1.8 {q10-q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 + .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 + .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.s b/main/openssl/crypto/sha/asm/sha256-armv4.s deleted file mode 100644 index 853d7da5..00000000 --- a/main/openssl/crypto/sha/asm/sha256-armv4.s +++ /dev/null @@ -1,2690 +0,0 @@ -#include "arm_arch.h" - -.text -.code 32 - -.type K256,%object -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.size K256,.-K256 -.word 0 @ terminator -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order -.align 5 - -.global sha256_block_data_order -.type sha256_block_data_order,%function -sha256_block_data_order: - sub r3,pc,#8 @ sha256_block_data_order - add r2,r1,r2,lsl#6 @ len to point at the end of inp -#if __ARM_ARCH__>=7 - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#ARMV8_SHA256 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - stmdb sp!,{r0,r1,r2,r4-r11,lr} - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r14,r3,#256+32 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) -.Loop: -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ magic - eor r12,r12,r12 -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 0 -# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 0 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 0==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 0<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 1 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 1 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 1==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 1<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 2 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 2 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 2==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 2<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 3 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 3 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 3==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 3<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 4 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 4 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 4==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 4<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 5 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 5==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 5<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 6 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 6 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 6==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 6<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 7 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 7==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 7<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 8 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 8 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 8==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 8<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 9 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 9 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 9==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 9<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 10 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 10 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 10==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 10<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 11 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 11 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 11==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 11<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 12 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 12 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 12==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 12<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 13 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 13 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 13==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 13<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 14 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 14 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 14==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 14<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 15 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) - rev r2,r2 -#else - @ ldrb r2,[r1,#3] @ 15 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 15==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 15<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -.Lrounds_16_xx: - @ ldr r2,[sp,#1*4] @ 16 - @ ldr r1,[sp,#14*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#0*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#9*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 16==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 16<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#2*4] @ 17 - @ ldr r1,[sp,#15*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#1*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#10*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 17==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 17<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#3*4] @ 18 - @ ldr r1,[sp,#0*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#2*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#11*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 18==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
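From .Lrounds_16_xx on, each round first extends the message schedule: the ror#7/ror#18/lsr#3 lines compute sigma0(X[i+1]) and the ror#17/ror#19/lsr#10 lines compute sigma1(X[i+14]), as the comments state, with the sixteen most recent words kept in the stack frame and overwritten in place. A reference C sketch of that expansion (names are illustrative; rotr32 is the same one-line helper as in the previous sketch):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

static inline uint32_t sigma0(uint32_t x) { return rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);  }
static inline uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

/* Produce the word for round i (i >= 16) from a 16-entry circular window,
   the same way the code recycles its 16-word stack area. */
static uint32_t next_w(uint32_t W[16], unsigned i)
{
    uint32_t w = W[i & 15]                  /* X[i-16] */
               + sigma0(W[(i + 1) & 15])    /* X[i-15] */
               + W[(i + 9) & 15]            /* X[i-7]  */
               + sigma1(W[(i + 14) & 15]);  /* X[i-2]  */
    W[i & 15] = w;
    return w;
}

The "and r3,r3,#0xff / cmp r3,#0xf2 @ done?" pair that closes round 31 is the loop-exit test: 0xf2 is the low byte of the last K256 constant (0xc67178f2), so the code can tell it has consumed the whole table without keeping a separate round counter.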
-#endif -#if 18<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#4*4] @ 19 - @ ldr r1,[sp,#1*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#3*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#12*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 19==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 19<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#5*4] @ 20 - @ ldr r1,[sp,#2*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#4*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#13*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 20==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 20<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#6*4] @ 21 - @ ldr r1,[sp,#3*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#5*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#14*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 21==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 21<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#7*4] @ 22 - @ ldr r1,[sp,#4*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#6*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#15*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 22==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 22<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#8*4] @ 23 - @ ldr r1,[sp,#5*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#7*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#0*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 23==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 23<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#9*4] @ 24 - @ ldr r1,[sp,#6*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#8*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#1*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 24==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 24<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#10*4] @ 25 - @ ldr r1,[sp,#7*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#9*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#2*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 25==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 25<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#11*4] @ 26 - @ ldr r1,[sp,#8*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#10*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#3*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 26==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 26<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#12*4] @ 27 - @ ldr r1,[sp,#9*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#11*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#4*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 27==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 27<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#13*4] @ 28 - @ ldr r1,[sp,#10*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#12*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#5*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 28==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 28<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#14*4] @ 29 - @ ldr r1,[sp,#11*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#13*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#6*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 29==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 29<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#15*4] @ 30 - @ ldr r1,[sp,#12*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#14*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#7*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 30==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 30<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#0*4] @ 31 - @ ldr r1,[sp,#13*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#15*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#8*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 31==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 31<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) - ldreq r3,[sp,#16*4] @ pull ctx - bne .Lrounds_16_xx - - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne .Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r11,pc} -#else - ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size sha256_block_data_order,.-sha256_block_data_order -#if __ARM_ARCH__>=7 -.fpu neon - -.type sha256_block_data_order_neon,%function -.align 4 -sha256_block_data_order_neon: -.LNEON: - stmdb sp!,{r4-r12,lr} - - mov r12,sp - sub sp,sp,#16*4+16 @ alloca - sub r14,r3,#256+32 @ K256 - bic sp,sp,#15 @ align for 128-bit stores - - vld1.8 {q0},[r1]! - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - vld1.32 {q8},[r14,:128]! - vld1.32 {q9},[r14,:128]! - vld1.32 {q10},[r14,:128]! - vld1.32 {q11},[r14,:128]! - vrev32.8 q0,q0 @ yes, even on - str r0,[sp,#64] - vrev32.8 q1,q1 @ big-endian - str r1,[sp,#68] - mov r1,sp - vrev32.8 q2,q2 - str r2,[sp,#72] - vrev32.8 q3,q3 - str r12,[sp,#76] @ save original sp - vadd.i32 q8,q8,q0 - vadd.i32 q9,q9,q1 - vst1.32 {q8},[r1,:128]! - vadd.i32 q10,q10,q2 - vst1.32 {q9},[r1,:128]! - vadd.i32 q11,q11,q3 - vst1.32 {q10},[r1,:128]! - vst1.32 {q11},[r1,:128]! 
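The scalar function ends above by pulling the context pointer back off the stack, adding the eight working registers into it, storing the result with stmia, and looping until the input pointer reaches inp+len; the NEON variant whose prologue follows has the same outer shape. A plain C sketch of that per-block driver, reusing the hypothetical helpers from the earlier sketches (the real code keeps everything in registers and a 19-word stack frame):

#include <stdint.h>
#include <stddef.h>

void     sha256_round(uint32_t s[8], uint32_t Wi, uint32_t Ki);  /* sketches above */
uint32_t next_w(uint32_t W[16], unsigned i);
extern const uint32_t K256[64];              /* the table walked by r14 above */

/* ctx[8] is the running hash state; inp points at 64*num_blocks bytes. */
void sha256_blocks(uint32_t ctx[8], const unsigned char *inp, size_t num_blocks)
{
    while (num_blocks--) {
        uint32_t W[16], s[8];
        for (int i = 0; i < 8; i++) s[i] = ctx[i];

        for (int i = 0; i < 16; i++) {       /* big-endian words: the rev / ldrb+orr paths */
            W[i] = ((uint32_t)inp[4*i] << 24) | ((uint32_t)inp[4*i + 1] << 16)
                 | ((uint32_t)inp[4*i + 2] << 8) | (uint32_t)inp[4*i + 3];
            sha256_round(s, W[i], K256[i]);
        }
        for (int i = 16; i < 64; i++)
            sha256_round(s, next_w(W, i), K256[i]);

        for (int i = 0; i < 8; i++) ctx[i] += s[i];  /* the ldr/add/stmia epilogue above */
        inp += 64;
    }
}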
- - ldmia r0,{r4-r11} - sub r1,r1,#64 - ldr r2,[sp,#0] - eor r12,r12,r12 - eor r3,r5,r6 - b .L_00_48 - -.align 4 -.L_00_48: - vext.8 q8,q0,q1,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q2,q3,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q0,q0,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#4] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d7,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d7,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d7,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q0,q0,q9 - add r10,r10,r2 - ldr r2,[sp,#8] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d7,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d7,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d0,d0,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d0,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d0,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d0,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#12] - and r3,r3,r12 - vshr.u32 d24,d0,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d0,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d1,d1,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q0 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q1,q2,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q3,q0,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q1,q1,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#20] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d1,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d1,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d1,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q1,q1,q9 - add r6,r6,r2 - ldr r2,[sp,#24] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d1,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d1,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d2,d2,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d2,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d2,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d2,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#28] - and r3,r3,r12 - vshr.u32 d24,d2,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! 
- add r4,r4,r2 - vsli.32 d24,d2,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d3,d3,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q1 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vext.8 q8,q2,q3,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q0,q1,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q2,q2,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#36] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d3,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d3,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d3,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q2,q2,q9 - add r10,r10,r2 - ldr r2,[sp,#40] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d3,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d3,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d4,d4,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d4,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d4,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d4,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#44] - and r3,r3,r12 - vshr.u32 d24,d4,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d4,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d5,d5,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q2 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! 
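Inside .L_00_48 above, the vector half of each group computes sigma0/sigma1 for four schedule words at once. ARMv7 NEON has no vector rotate instruction, so each ROTR is assembled from a right shift (vshr) whose vacated high bits are refilled by a shift-left-and-insert (vsli); for example vshr.u32 q10,q8,#7 followed by vsli.32 q10,q8,#25 leaves ROTR(x,7) in q10. A small intrinsics sketch of that idiom (illustrative only, and deliberately ignoring the careful interleaving with the integer rounds done above):

#include <arm_neon.h>

/* sigma0 on four lanes: ROTR7 ^ ROTR18 ^ SHR3, each rotate built as vshr + vsli. */
static inline uint32x4_t sigma0_x4(uint32x4_t x)
{
    uint32x4_t r7  = vsliq_n_u32(vshrq_n_u32(x, 7),  x, 25);  /* rotate right by 7  */
    uint32x4_t r18 = vsliq_n_u32(vshrq_n_u32(x, 18), x, 14);  /* rotate right by 18 */
    uint32x4_t s3  = vshrq_n_u32(x, 3);                       /* plain shift        */
    return veorq_u32(veorq_u32(r7, r18), s3);
}

The vext.8 instructions at the top of each group simply shuffle the last sixteen schedule words so that X[i+1..i+4] and X[i+9..i+12] each line up in a single q register before these shifts are applied.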
- add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q3,q0,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q1,q2,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q3,q3,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#52] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d5,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d5,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d5,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q3,q3,q9 - add r6,r6,r2 - ldr r2,[sp,#56] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d5,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d5,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d6,d6,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d6,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d6,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d6,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#60] - and r3,r3,r12 - vshr.u32 d24,d6,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! - add r4,r4,r2 - vsli.32 d24,d6,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d7,d7,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q3 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[r14] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - teq r2,#0 @ check for K256 terminator - ldr r2,[sp,#0] - sub r1,r1,#64 - bne .L_00_48 - - ldr r1,[sp,#68] - ldr r0,[sp,#72] - sub r14,r14,#256 @ rewind r14 - teq r1,r0 - subeq r1,r1,#64 @ avoid SEGV - vld1.8 {q0},[r1]! @ load next input block - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - strne r1,[sp,#68] - mov r1,sp - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q0,q0 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q0 - ldr r2,[sp,#4] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#8] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#12] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! 
- add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q1,q1 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q1 - ldr r2,[sp,#20] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#24] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#28] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q2,q2 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q2 - ldr r2,[sp,#36] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#40] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#44] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! 
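Throughout the NEON path the vector unit also pre-adds the round constants into each freshly prepared group of schedule words (the vadd.i32/vst1.32 {q8},[r1,:128]! pairs above) and parks the sums on the stack, so the integer rounds only need one ldr and one add per round instead of separate X[i] and K256[i] additions. The idea in C, purely as a sketch (the real code stages group i+1 while the integer pipe is still consuming group i):

#include <stdint.h>

/* Stage W[i]+K256[i] for one group of four rounds; the NEON code does this
   four lanes at a time, and the integer rounds then just do h += WK[i]. */
static void stage_wk4(uint32_t WK[4], const uint32_t W[4], const uint32_t K[4])
{
    for (int i = 0; i < 4; i++)
        WK[i] = W[i] + K[i];
}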
- and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q3,q3 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q3 - ldr r2,[sp,#52] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#56] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#60] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#64] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - ldr r0,[r2,#0] - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r12,[r2,#4] - ldr r3,[r2,#8] - ldr r1,[r2,#12] - add r4,r4,r0 @ accumulate - ldr r0,[r2,#16] - add r5,r5,r12 - ldr r12,[r2,#20] - add r6,r6,r3 - ldr r3,[r2,#24] - add r7,r7,r1 - ldr r1,[r2,#28] - add r8,r8,r0 - str r4,[r2],#4 - add r9,r9,r12 - str r5,[r2],#4 - add r10,r10,r3 - str r6,[r2],#4 - add r11,r11,r1 - str r7,[r2],#4 - stmia r2,{r8-r11} - - movne r1,sp - ldrne r2,[sp,#0] - eorne r12,r12,r12 - ldreq sp,[sp,#76] @ restore original sp - eorne r3,r5,r6 - bne .L_00_48 - - ldmia sp!,{r4-r12,pc} -.size sha256_block_data_order_neon,.-sha256_block_data_order_neon -#endif -#if __ARM_ARCH__>=7 -.type sha256_block_data_order_armv8,%function -.align 5 -sha256_block_data_order_armv8: -.LARMv8: - vld1.32 {q0,q1},[r0] - sub r3,r3,#sha256_block_data_order-K256 - -.Loop_v8: - vld1.8 {q8-q9},[r1]! - vld1.8 {q10-q11},[r1]! - vld1.32 {q12},[r3]! - vrev32.8 q8,q8 - vrev32.8 q9,q9 - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vmov q14,q0 @ offload - vmov q15,q1 - teq r1,r2 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 - vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 - vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 - vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 - vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 - vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! 
-	vadd.i32	q13,q13,q9
-	.byte	0xe4,0x23,0xfa,0xf3	@ sha256su0 q9,q10
-	vmov	q2,q0
-	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13
-	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13
-	.byte	0xe0,0x2c,0x66,0xf3	@ sha256su1 q9,q11,q8
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q10
-	.byte	0xe6,0x43,0xfa,0xf3	@ sha256su0 q10,q11
-	vmov	q2,q0
-	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12
-	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12
-	.byte	0xe2,0x4c,0x60,0xf3	@ sha256su1 q10,q8,q9
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q11
-	.byte	0xe0,0x63,0xfa,0xf3	@ sha256su0 q11,q8
-	vmov	q2,q0
-	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13
-	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13
-	.byte	0xe4,0x6c,0x62,0xf3	@ sha256su1 q11,q9,q10
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	.byte	0xe2,0x03,0xfa,0xf3	@ sha256su0 q8,q9
-	vmov	q2,q0
-	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12
-	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12
-	.byte	0xe6,0x0c,0x64,0xf3	@ sha256su1 q8,q10,q11
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q9
-	.byte	0xe4,0x23,0xfa,0xf3	@ sha256su0 q9,q10
-	vmov	q2,q0
-	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13
-	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13
-	.byte	0xe0,0x2c,0x66,0xf3	@ sha256su1 q9,q11,q8
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q10
-	.byte	0xe6,0x43,0xfa,0xf3	@ sha256su0 q10,q11
-	vmov	q2,q0
-	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12
-	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12
-	.byte	0xe2,0x4c,0x60,0xf3	@ sha256su1 q10,q8,q9
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q11
-	.byte	0xe0,0x63,0xfa,0xf3	@ sha256su0 q11,q8
-	vmov	q2,q0
-	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13
-	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13
-	.byte	0xe4,0x6c,0x62,0xf3	@ sha256su1 q11,q9,q10
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	vmov	q2,q0
-	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12
-	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12
-
-	vld1.32	{q12},[r3]!
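The .byte sequences above encode the ARMv8 SHA-256 instructions named in their trailing comments (sha256su0, sha256h, sha256h2, sha256su1); emitting the raw opcodes keeps the file assemblable with toolchains that predate those mnemonics. For comparison, one such quad-round step written with the ACLE intrinsics, a sketch assuming a crypto-capable ARMv8 toolchain (arm_neon.h, e.g. -march=armv8-a+crypto), not the code generated here:

#include <arm_neon.h>

/* One quad round: WK holds four schedule words with the constants already added,
   ABCD/EFGH are the two state halves, and W0 is extended for later rounds,
   mirroring the vadd/sha256su0/sha256h/sha256h2/sha256su1 pattern above. */
static void sha256_quad_round(uint32x4_t *ABCD, uint32x4_t *EFGH, uint32x4_t WK,
                              uint32x4_t *W0, uint32x4_t W4, uint32x4_t W8, uint32x4_t W12)
{
    uint32x4_t abcd = *ABCD;                     /* vmov q2,q0              */
    *W0   = vsha256su0q_u32(*W0, W4);            /* sha256su0 W0,W4         */
    *ABCD = vsha256hq_u32(*ABCD, *EFGH, WK);     /* sha256h   ABCD,EFGH,WK  */
    *EFGH = vsha256h2q_u32(*EFGH, abcd, WK);     /* sha256h2  EFGH,abcd,WK  */
    *W0   = vsha256su1q_u32(*W0, W8, W12);       /* sha256su1 W0,W8,W12     */
}

The last four quad rounds, beginning with the group just above, skip the su0/su1 schedule update because no further message words are needed.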
- vadd.i32 q13,q13,q9 - vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - - vld1.32 {q13},[r3] - vadd.i32 q12,q12,q10 - sub r3,r3,#256-16 @ rewind - vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - - vadd.i32 q13,q13,q11 - vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - - vadd.i32 q0,q0,q14 - vadd.i32 q1,q1,q15 - bne .Loop_v8 - - vst1.32 {q0,q1},[r0] - - bx lr @ bx lr -.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 -#endif -.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " -.align 2 -.comm OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.S b/main/openssl/crypto/sha/asm/sha512-armv4.S new file mode 100644 index 00000000..fd462771 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha512-armv4.S @@ -0,0 +1,1783 @@ +#include "arm_arch.h" +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +.code 32 +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 + +.global sha512_block_data_order +.type 
sha512_block_data_order,%function +sha512_block_data_order: + sub r3,pc,#8 @ sha512_block_data_order + add r2,r1,r2,lsl#7 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + stmdb sp!,{r4-r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) 
(ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, 
[r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +#if __ARM_ARCH__>=7 +.fpu neon + +.align 4 +.LNEON: + dmb @ errata #451034 on early Cortex A8 + vstmdb sp!,{d8-d15} @ ABI specification says so + sub r3,r3,#672 @ K512 + vldmia r0,{d16-d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d0 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d1 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d2 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d3 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d4 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d5 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d6 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d7 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d8 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d9 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d10 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d11 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d12 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d13 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d14 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d15 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d0 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d1 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d2 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d3 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d4 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d5 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d6 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d7 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d23 + veor d29,d21,d22 + veor d24,d25 + vand d29,d20 + veor d24,d26 @ Sigma1(e) + veor d29,d22 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d16,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d16,#34 + vshr.u64 d26,d16,#39 + vsli.64 d24,d16,#36 + vsli.64 d25,d16,#30 + vsli.64 d26,d16,#25 + vadd.i64 d27,d8 + vorr d30,d16,d18 + vand d29,d16,d18 + veor d23,d24,d25 + vand d30,d17 + veor d23,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d23,d27 + vadd.i64 d19,d27 + vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d22 + veor d29,d20,d21 + veor d24,d25 + vand d29,d19 + veor d24,d26 @ Sigma1(e) + veor d29,d21 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d23,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d23,#34 + vshr.u64 d26,d23,#39 + vsli.64 d24,d23,#36 + vsli.64 d25,d23,#30 + vsli.64 d26,d23,#25 + vadd.i64 d27,d9 + vorr d30,d23,d17 + vand d29,d23,d17 + veor d22,d24,d25 + vand d30,d16 + veor d22,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d22,d27 + vadd.i64 d18,d27 + vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d21 + veor d29,d19,d20 + veor d24,d25 + vand d29,d18 + veor d24,d26 @ Sigma1(e) + veor d29,d20 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d22,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d22,#34 + vshr.u64 d26,d22,#39 + vsli.64 d24,d22,#36 + vsli.64 d25,d22,#30 + vsli.64 d26,d22,#25 + vadd.i64 d27,d10 + vorr d30,d22,d16 + vand d29,d22,d16 + veor d21,d24,d25 + vand d30,d23 + veor d21,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d21,d27 + vadd.i64 d17,d27 + vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d20 + veor d29,d18,d19 + veor d24,d25 + vand d29,d17 + veor d24,d26 @ Sigma1(e) + veor d29,d19 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d21,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d21,#34 + vshr.u64 d26,d21,#39 + vsli.64 d24,d21,#36 + vsli.64 d25,d21,#30 + vsli.64 d26,d21,#25 + vadd.i64 d27,d11 + vorr d30,d21,d23 + vand d29,d21,d23 + veor d20,d24,d25 + vand d30,d22 + veor d20,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d20,d27 + vadd.i64 d16,d27 + vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d19 + veor d29,d17,d18 + veor d24,d25 + vand d29,d16 + veor d24,d26 @ Sigma1(e) + veor d29,d18 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d20,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d20,#34 + vshr.u64 d26,d20,#39 + vsli.64 d24,d20,#36 + vsli.64 d25,d20,#30 + vsli.64 d26,d20,#25 + vadd.i64 d27,d12 + vorr d30,d20,d22 + vand d29,d20,d22 + veor d19,d24,d25 + vand d30,d21 + veor d19,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d19,d27 + vadd.i64 d23,d27 + vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d18 + veor d29,d16,d17 + veor d24,d25 + vand d29,d23 + veor d24,d26 @ Sigma1(e) + veor d29,d17 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d19,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d19,#34 + vshr.u64 d26,d19,#39 + vsli.64 d24,d19,#36 + vsli.64 d25,d19,#30 + vsli.64 d26,d19,#25 + vadd.i64 d27,d13 + vorr d30,d19,d21 + vand d29,d19,d21 + veor d18,d24,d25 + vand d30,d20 + veor d18,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d18,d27 + vadd.i64 d22,d27 + vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d17 + veor d29,d23,d16 + veor d24,d25 + vand d29,d22 + veor d24,d26 @ Sigma1(e) + veor d29,d16 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d18,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d18,#34 + vshr.u64 d26,d18,#39 + vsli.64 d24,d18,#36 + vsli.64 d25,d18,#30 + vsli.64 d26,d18,#25 + vadd.i64 d27,d14 + vorr d30,d18,d20 + vand d29,d18,d20 + veor d17,d24,d25 + vand d30,d19 + veor d17,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d17,d27 + vadd.i64 d21,d27 + vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + vadd.i64 d27,d28,d16 + veor d29,d22,d23 + veor d24,d25 + vand d29,d21 + veor d24,d26 @ Sigma1(e) + veor d29,d23 @ Ch(e,f,g) + vadd.i64 d27,d24 + vshr.u64 d24,d17,#28 + vadd.i64 d27,d29 + vshr.u64 d25,d17,#34 + vshr.u64 d26,d17,#39 + vsli.64 d24,d17,#36 + vsli.64 d25,d17,#30 + vsli.64 d26,d17,#25 + vadd.i64 d27,d15 + vorr d30,d17,d19 + vand d29,d17,d19 + veor d16,d24,d25 + vand d30,d18 + veor d16,d26 @ Sigma0(a) + vorr d30,d29 @ Maj(a,b,c) + vadd.i64 d16,d27 + vadd.i64 d20,d27 + vadd.i64 d16,d30 + bne .L16_79_neon + + vldmia r0,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16-d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + vldmia sp!,{d8-d15} @ epilogue + bx lr @ .word 0xe12fff1e +#endif +.size sha512_block_data_order,.-sha512_block_data_order +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +.comm OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.s b/main/openssl/crypto/sha/asm/sha512-armv4.s deleted file mode 100644 index fd462771..00000000 --- a/main/openssl/crypto/sha/asm/sha512-armv4.s +++ /dev/null @@ -1,1783 +0,0 @@ -#include "arm_arch.h" -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -.code 32 -.type K512,%object -.align 5 -K512: -WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) -WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) -WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) -WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) -WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) -WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) -WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) -WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) -WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) -WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) -WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) -WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) -WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) -WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) -WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) -WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) -WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) -WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) -WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) -WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) -WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) -WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) -WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) -WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) -WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) -WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) -WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) -WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) -WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) -WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) -WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) -WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) -WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) -WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) -WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) 
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) -WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) -WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) -WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) -WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) -.size K512,.-K512 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order -.skip 32-4 - -.global sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: - sub r3,pc,#8 @ sha512_block_data_order - add r2,r1,r2,lsl#7 @ len to point at the end of inp -#if __ARM_ARCH__>=7 - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 - bne .LNEON -#endif - stmdb sp!,{r4-r12,lr} - sub r14,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -.Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - -.L00_15: -#if __ARM_ARCH__<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef __ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T 
+= Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq .L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -.L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 - adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq .L16_79 - bic 
r14,r14,#1 - - ldr r3,[sp,#8+0] - ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, [r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, [r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne .Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -#if __ARM_ARCH__>=7 -.fpu neon - -.align 4 -.LNEON: - dmb @ errata #451034 on early Cortex A8 - vstmdb sp!,{d8-d15} @ ABI specification says so - sub r3,r3,#672 @ K512 - vldmia r0,{d16-d23} @ load context -.Loop_neon: - vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d0 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d1 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d2 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d3 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d4 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d5 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d6 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d7 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d8 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - vadd.i64 d27,d28,d22 - veor d29,d20,d21 - veor d24,d25 - vand d29,d19 - veor d24,d26 @ Sigma1(e) - veor d29,d21 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d23,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d23,#34 - vshr.u64 d26,d23,#39 - vsli.64 d24,d23,#36 - vsli.64 d25,d23,#30 - vsli.64 d26,d23,#25 - vadd.i64 d27,d9 - vorr d30,d23,d17 - vand d29,d23,d17 - veor d22,d24,d25 - vand d30,d16 - veor d22,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d22,d27 - vadd.i64 d18,d27 - vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - vadd.i64 d27,d28,d21 - veor d29,d19,d20 - veor d24,d25 - vand d29,d18 - veor d24,d26 @ Sigma1(e) - veor d29,d20 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d22,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d22,#34 - vshr.u64 d26,d22,#39 - vsli.64 d24,d22,#36 - vsli.64 d25,d22,#30 - vsli.64 d26,d22,#25 - vadd.i64 d27,d10 - vorr d30,d22,d16 - vand d29,d22,d16 - veor d21,d24,d25 - vand d30,d23 - veor d21,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d21,d27 - vadd.i64 d17,d27 - vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - vadd.i64 d27,d28,d20 - veor d29,d18,d19 - veor d24,d25 - vand d29,d17 - veor d24,d26 @ Sigma1(e) - veor d29,d19 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d21,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d21,#34 - vshr.u64 d26,d21,#39 - vsli.64 d24,d21,#36 - vsli.64 d25,d21,#30 - vsli.64 d26,d21,#25 - vadd.i64 d27,d11 - vorr d30,d21,d23 - vand d29,d21,d23 - veor d20,d24,d25 - vand d30,d22 - veor d20,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d20,d27 - vadd.i64 d16,d27 - vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - vadd.i64 d27,d28,d19 - veor d29,d17,d18 - veor d24,d25 - vand d29,d16 - veor d24,d26 @ Sigma1(e) - veor d29,d18 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d20,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d20,#34 - vshr.u64 d26,d20,#39 - vsli.64 d24,d20,#36 - vsli.64 d25,d20,#30 - vsli.64 d26,d20,#25 - vadd.i64 d27,d12 - vorr d30,d20,d22 - vand d29,d20,d22 - veor d19,d24,d25 - vand d30,d21 - veor d19,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d19,d27 - vadd.i64 d23,d27 - vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - vadd.i64 d27,d28,d18 - veor d29,d16,d17 - veor d24,d25 - vand d29,d23 - veor d24,d26 @ Sigma1(e) - veor d29,d17 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d19,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d19,#34 - vshr.u64 d26,d19,#39 - vsli.64 d24,d19,#36 - vsli.64 d25,d19,#30 - vsli.64 d26,d19,#25 - vadd.i64 d27,d13 - vorr d30,d19,d21 - vand d29,d19,d21 - veor d18,d24,d25 - vand d30,d20 - veor d18,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d18,d27 - vadd.i64 d22,d27 - vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - vadd.i64 d27,d28,d17 - veor d29,d23,d16 - veor d24,d25 - vand d29,d22 - veor d24,d26 @ Sigma1(e) - veor d29,d16 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d18,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d18,#34 - vshr.u64 d26,d18,#39 - vsli.64 d24,d18,#36 - vsli.64 d25,d18,#30 - vsli.64 d26,d18,#25 - vadd.i64 d27,d14 - vorr d30,d18,d20 - vand d29,d18,d20 - veor d17,d24,d25 - vand d30,d19 - veor d17,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d17,d27 - vadd.i64 d21,d27 - vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - vadd.i64 d27,d28,d16 - veor d29,d22,d23 - veor d24,d25 - vand d29,d21 - veor d24,d26 @ Sigma1(e) - veor d29,d23 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d17,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d17,#34 - vshr.u64 d26,d17,#39 - vsli.64 d24,d17,#36 - vsli.64 d25,d17,#30 - vsli.64 d26,d17,#25 - vadd.i64 d27,d15 - vorr d30,d17,d19 - vand d29,d17,d19 - veor d16,d24,d25 - vand d30,d18 - veor d16,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d16,d27 - vadd.i64 d20,d27 - vadd.i64 d16,d30 - mov r12,#4 -.L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - vadd.i64 d27,d28,d23 - veor d29,d21,d22 - veor d24,d25 - vand d29,d20 - veor d24,d26 @ Sigma1(e) - veor d29,d22 @ Ch(e,f,g) - vadd.i64 d27,d24 - vshr.u64 d24,d16,#28 - vadd.i64 d27,d29 - vshr.u64 d25,d16,#34 - vshr.u64 d26,d16,#39 - vsli.64 d24,d16,#36 - vsli.64 d25,d16,#30 - vsli.64 d26,d16,#25 - vadd.i64 d27,d0 - vorr d30,d16,d18 - vand d29,d16,d18 - veor d23,d24,d25 - vand d30,d17 - veor d23,d26 @ Sigma0(a) - vorr d30,d29 @ Maj(a,b,c) - vadd.i64 d23,d27 - vadd.i64 d19,d27 - vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vsli.64	d26,d19,#23
-#if 17<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d22
-	veor	d29,d20,d21
-	veor	d24,d25
-	vand	d29,d19
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d21	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d23,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d23,#34
-	vshr.u64	d26,d23,#39
-	vsli.64	d24,d23,#36
-	vsli.64	d25,d23,#30
-	vsli.64	d26,d23,#25
-	vadd.i64	d27,d1
-	vorr	d30,d23,d17
-	vand	d29,d23,d17
-	veor	d22,d24,d25
-	vand	d30,d16
-	veor	d22,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d22,d27
-	vadd.i64	d18,d27
-	vadd.i64	d22,d30
-	vshr.u64	q12,q0,#19
-	vshr.u64	q13,q0,#61
-	vshr.u64	q15,q0,#6
-	vsli.64	q12,q0,#45
-	vext.8	q14,q1,q2,#8	@ X[i+1]
-	vsli.64	q13,q0,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q1,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q5,q6,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d18,#14	@ from NEON_00_15
-	vadd.i64	q1,q14
-	vshr.u64	d25,d18,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d18,#41	@ from NEON_00_15
-	vadd.i64	q1,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vsli.64	d26,d18,#23
-#if 18<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d21
-	veor	d29,d19,d20
-	veor	d24,d25
-	vand	d29,d18
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d20	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d22,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d22,#34
-	vshr.u64	d26,d22,#39
-	vsli.64	d24,d22,#36
-	vsli.64	d25,d22,#30
-	vsli.64	d26,d22,#25
-	vadd.i64	d27,d2
-	vorr	d30,d22,d16
-	vand	d29,d22,d16
-	veor	d21,d24,d25
-	vand	d30,d23
-	veor	d21,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d21,d27
-	vadd.i64	d17,d27
-	vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 19
-#if 19<16
-	vld1.64	{d3},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vsli.64	d26,d17,#23
-#if 19<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d20
-	veor	d29,d18,d19
-	veor	d24,d25
-	vand	d29,d17
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d19	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d21,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d21,#34
-	vshr.u64	d26,d21,#39
-	vsli.64	d24,d21,#36
-	vsli.64	d25,d21,#30
-	vsli.64	d26,d21,#25
-	vadd.i64	d27,d3
-	vorr	d30,d21,d23
-	vand	d29,d21,d23
-	veor	d20,d24,d25
-	vand	d30,d22
-	veor	d20,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d20,d27
-	vadd.i64	d16,d27
-	vadd.i64	d20,d30
-	vshr.u64	q12,q1,#19
-	vshr.u64	q13,q1,#61
-	vshr.u64	q15,q1,#6
-	vsli.64	q12,q1,#45
-	vext.8	q14,q2,q3,#8	@ X[i+1]
-	vsli.64	q13,q1,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q2,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q6,q7,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d16,#14	@ from NEON_00_15
-	vadd.i64	q2,q14
-	vshr.u64	d25,d16,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d16,#41	@ from NEON_00_15
-	vadd.i64	q2,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vsli.64	d26,d16,#23
-#if 20<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d19
-	veor	d29,d17,d18
-	veor	d24,d25
-	vand	d29,d16
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d18	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d20,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d20,#34
-	vshr.u64	d26,d20,#39
-	vsli.64	d24,d20,#36
-	vsli.64	d25,d20,#30
-	vsli.64	d26,d20,#25
-	vadd.i64	d27,d4
-	vorr	d30,d20,d22
-	vand	d29,d20,d22
-	veor	d19,d24,d25
-	vand	d30,d21
-	veor	d19,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d19,d27
-	vadd.i64	d23,d27
-	vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 21
-#if 21<16
-	vld1.64	{d5},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vsli.64	d26,d23,#23
-#if 21<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d18
-	veor	d29,d16,d17
-	veor	d24,d25
-	vand	d29,d23
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d17	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d19,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d19,#34
-	vshr.u64	d26,d19,#39
-	vsli.64	d24,d19,#36
-	vsli.64	d25,d19,#30
-	vsli.64	d26,d19,#25
-	vadd.i64	d27,d5
-	vorr	d30,d19,d21
-	vand	d29,d19,d21
-	veor	d18,d24,d25
-	vand	d30,d20
-	veor	d18,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d18,d27
-	vadd.i64	d22,d27
-	vadd.i64	d18,d30
-	vshr.u64	q12,q2,#19
-	vshr.u64	q13,q2,#61
-	vshr.u64	q15,q2,#6
-	vsli.64	q12,q2,#45
-	vext.8	q14,q3,q4,#8	@ X[i+1]
-	vsli.64	q13,q2,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q3,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q7,q0,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d22,#14	@ from NEON_00_15
-	vadd.i64	q3,q14
-	vshr.u64	d25,d22,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d22,#41	@ from NEON_00_15
-	vadd.i64	q3,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vsli.64	d26,d22,#23
-#if 22<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d17
-	veor	d29,d23,d16
-	veor	d24,d25
-	vand	d29,d22
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d16	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d18,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d18,#34
-	vshr.u64	d26,d18,#39
-	vsli.64	d24,d18,#36
-	vsli.64	d25,d18,#30
-	vsli.64	d26,d18,#25
-	vadd.i64	d27,d6
-	vorr	d30,d18,d20
-	vand	d29,d18,d20
-	veor	d17,d24,d25
-	vand	d30,d19
-	veor	d17,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d17,d27
-	vadd.i64	d21,d27
-	vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 23
-#if 23<16
-	vld1.64	{d7},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vsli.64	d26,d21,#23
-#if 23<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d16
-	veor	d29,d22,d23
-	veor	d24,d25
-	vand	d29,d21
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d23	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d17,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d17,#34
-	vshr.u64	d26,d17,#39
-	vsli.64	d24,d17,#36
-	vsli.64	d25,d17,#30
-	vsli.64	d26,d17,#25
-	vadd.i64	d27,d7
-	vorr	d30,d17,d19
-	vand	d29,d17,d19
-	veor	d16,d24,d25
-	vand	d30,d18
-	veor	d16,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d16,d27
-	vadd.i64	d20,d27
-	vadd.i64	d16,d30
-	vshr.u64	q12,q3,#19
-	vshr.u64	q13,q3,#61
-	vshr.u64	q15,q3,#6
-	vsli.64	q12,q3,#45
-	vext.8	q14,q4,q5,#8	@ X[i+1]
-	vsli.64	q13,q3,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q4,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q0,q1,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d20,#14	@ from NEON_00_15
-	vadd.i64	q4,q14
-	vshr.u64	d25,d20,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d20,#41	@ from NEON_00_15
-	vadd.i64	q4,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d20,#50
-	vsli.64	d25,d20,#46
-	vsli.64	d26,d20,#23
-#if 24<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d23
-	veor	d29,d21,d22
-	veor	d24,d25
-	vand	d29,d20
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d22	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d16,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d16,#34
-	vshr.u64	d26,d16,#39
-	vsli.64	d24,d16,#36
-	vsli.64	d25,d16,#30
-	vsli.64	d26,d16,#25
-	vadd.i64	d27,d8
-	vorr	d30,d16,d18
-	vand	d29,d16,d18
-	veor	d23,d24,d25
-	vand	d30,d17
-	veor	d23,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d23,d27
-	vadd.i64	d19,d27
-	vadd.i64	d23,d30
-	vshr.u64	d24,d19,#14	@ 25
-#if 25<16
-	vld1.64	{d9},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d19,#18
-	vshr.u64	d26,d19,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vsli.64	d26,d19,#23
-#if 25<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d22
-	veor	d29,d20,d21
-	veor	d24,d25
-	vand	d29,d19
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d21	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d23,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d23,#34
-	vshr.u64	d26,d23,#39
-	vsli.64	d24,d23,#36
-	vsli.64	d25,d23,#30
-	vsli.64	d26,d23,#25
-	vadd.i64	d27,d9
-	vorr	d30,d23,d17
-	vand	d29,d23,d17
-	veor	d22,d24,d25
-	vand	d30,d16
-	veor	d22,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d22,d27
-	vadd.i64	d18,d27
-	vadd.i64	d22,d30
-	vshr.u64	q12,q4,#19
-	vshr.u64	q13,q4,#61
-	vshr.u64	q15,q4,#6
-	vsli.64	q12,q4,#45
-	vext.8	q14,q5,q6,#8	@ X[i+1]
-	vsli.64	q13,q4,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q5,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q1,q2,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d18,#14	@ from NEON_00_15
-	vadd.i64	q5,q14
-	vshr.u64	d25,d18,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d18,#41	@ from NEON_00_15
-	vadd.i64	q5,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vsli.64	d26,d18,#23
-#if 26<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d21
-	veor	d29,d19,d20
-	veor	d24,d25
-	vand	d29,d18
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d20	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d22,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d22,#34
-	vshr.u64	d26,d22,#39
-	vsli.64	d24,d22,#36
-	vsli.64	d25,d22,#30
-	vsli.64	d26,d22,#25
-	vadd.i64	d27,d10
-	vorr	d30,d22,d16
-	vand	d29,d22,d16
-	veor	d21,d24,d25
-	vand	d30,d23
-	veor	d21,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d21,d27
-	vadd.i64	d17,d27
-	vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 27
-#if 27<16
-	vld1.64	{d11},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vsli.64	d26,d17,#23
-#if 27<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d20
-	veor	d29,d18,d19
-	veor	d24,d25
-	vand	d29,d17
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d19	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d21,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d21,#34
-	vshr.u64	d26,d21,#39
-	vsli.64	d24,d21,#36
-	vsli.64	d25,d21,#30
-	vsli.64	d26,d21,#25
-	vadd.i64	d27,d11
-	vorr	d30,d21,d23
-	vand	d29,d21,d23
-	veor	d20,d24,d25
-	vand	d30,d22
-	veor	d20,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d20,d27
-	vadd.i64	d16,d27
-	vadd.i64	d20,d30
-	vshr.u64	q12,q5,#19
-	vshr.u64	q13,q5,#61
-	vshr.u64	q15,q5,#6
-	vsli.64	q12,q5,#45
-	vext.8	q14,q6,q7,#8	@ X[i+1]
-	vsli.64	q13,q5,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q6,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q2,q3,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d16,#14	@ from NEON_00_15
-	vadd.i64	q6,q14
-	vshr.u64	d25,d16,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d16,#41	@ from NEON_00_15
-	vadd.i64	q6,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vsli.64	d26,d16,#23
-#if 28<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d19
-	veor	d29,d17,d18
-	veor	d24,d25
-	vand	d29,d16
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d18	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d20,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d20,#34
-	vshr.u64	d26,d20,#39
-	vsli.64	d24,d20,#36
-	vsli.64	d25,d20,#30
-	vsli.64	d26,d20,#25
-	vadd.i64	d27,d12
-	vorr	d30,d20,d22
-	vand	d29,d20,d22
-	veor	d19,d24,d25
-	vand	d30,d21
-	veor	d19,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d19,d27
-	vadd.i64	d23,d27
-	vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 29
-#if 29<16
-	vld1.64	{d13},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vsli.64	d26,d23,#23
-#if 29<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d18
-	veor	d29,d16,d17
-	veor	d24,d25
-	vand	d29,d23
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d17	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d19,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d19,#34
-	vshr.u64	d26,d19,#39
-	vsli.64	d24,d19,#36
-	vsli.64	d25,d19,#30
-	vsli.64	d26,d19,#25
-	vadd.i64	d27,d13
-	vorr	d30,d19,d21
-	vand	d29,d19,d21
-	veor	d18,d24,d25
-	vand	d30,d20
-	veor	d18,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d18,d27
-	vadd.i64	d22,d27
-	vadd.i64	d18,d30
-	vshr.u64	q12,q6,#19
-	vshr.u64	q13,q6,#61
-	vshr.u64	q15,q6,#6
-	vsli.64	q12,q6,#45
-	vext.8	q14,q7,q0,#8	@ X[i+1]
-	vsli.64	q13,q6,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13	@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q7,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q3,q4,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d22,#14	@ from NEON_00_15
-	vadd.i64	q7,q14
-	vshr.u64	d25,d22,#18	@ from NEON_00_15
-	veor	q15,q13	@ sigma0(X[i+1])
-	vshr.u64	d26,d22,#41	@ from NEON_00_15
-	vadd.i64	q7,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vsli.64	d26,d22,#23
-#if 30<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d17
-	veor	d29,d23,d16
-	veor	d24,d25
-	vand	d29,d22
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d16	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d18,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d18,#34
-	vshr.u64	d26,d18,#39
-	vsli.64	d24,d18,#36
-	vsli.64	d25,d18,#30
-	vsli.64	d26,d18,#25
-	vadd.i64	d27,d14
-	vorr	d30,d18,d20
-	vand	d29,d18,d20
-	veor	d17,d24,d25
-	vand	d30,d19
-	veor	d17,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d17,d27
-	vadd.i64	d21,d27
-	vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 31
-#if 31<16
-	vld1.64	{d15},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vsli.64	d26,d21,#23
-#if 31<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	vadd.i64	d27,d28,d16
-	veor	d29,d22,d23
-	veor	d24,d25
-	vand	d29,d21
-	veor	d24,d26	@ Sigma1(e)
-	veor	d29,d23	@ Ch(e,f,g)
-	vadd.i64	d27,d24
-	vshr.u64	d24,d17,#28
-	vadd.i64	d27,d29
-	vshr.u64	d25,d17,#34
-	vshr.u64	d26,d17,#39
-	vsli.64	d24,d17,#36
-	vsli.64	d25,d17,#30
-	vsli.64	d26,d17,#25
-	vadd.i64	d27,d15
-	vorr	d30,d17,d19
-	vand	d29,d17,d19
-	veor	d16,d24,d25
-	vand	d30,d18
-	veor	d16,d26	@ Sigma0(a)
-	vorr	d30,d29	@ Maj(a,b,c)
-	vadd.i64	d16,d27
-	vadd.i64	d20,d27
-	vadd.i64	d16,d30
-	bne	.L16_79_neon
-
-	vldmia	r0,{d24-d31}	@ load context to temp
-	vadd.i64	q8,q12	@ vectorized accumulate
-	vadd.i64	q9,q13
-	vadd.i64	q10,q14
-	vadd.i64	q11,q15
-	vstmia	r0,{d16-d23}	@ save context
-	teq	r1,r2
-	sub	r3,#640	@ rewind K512
-	bne	.Loop_neon
-
-	vldmia	sp!,{d8-d15}	@ epilogue
-	bx	lr	@ .word 0xe12fff1e
-#endif
-.size	sha512_block_data_order,.-sha512_block_data_order
-.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by "
-.align	2
-.comm	OPENSSL_armcap_P,4,4
--
cgit v1.2.3
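
Editor's note (not part of the patch): the NEON hunk above computes the standard SHA-512 round functions named in its comments. Each 64-bit rotate is built from a vshr.u64/vsli.64 pair whose immediates sum to 64 (e.g. #14 with #50), Ch is formed as ((f^g)&e)^g and Maj as ((a|c)&b)|(a&c), which are bitwise-equivalent to the textbook definitions, and the message schedule follows X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]). A minimal scalar C sketch of those functions follows; the helper names (ror64, Sigma0, Sigma1, sigma0, sigma1, Ch, Maj) are illustrative only and do not appear in the patched sources.

	/* Scalar reference for the SHA-512 round functions used by the
	 * NEON code above; names are illustrative, not from OpenSSL. */
	#include <stdint.h>

	static inline uint64_t ror64(uint64_t x, unsigned n)
	{
		return (x >> n) | (x << (64 - n));	/* rotate right, 0 < n < 64 */
	}

	static inline uint64_t Sigma0(uint64_t a) { return ror64(a,28) ^ ror64(a,34) ^ ror64(a,39); }
	static inline uint64_t Sigma1(uint64_t e) { return ror64(e,14) ^ ror64(e,18) ^ ror64(e,41); }
	static inline uint64_t sigma0(uint64_t x) { return ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7); }
	static inline uint64_t sigma1(uint64_t x) { return ror64(x,19) ^ ror64(x,61) ^ (x >> 6); }
	static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
	static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }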