65 files changed, 13775 insertions(+), 3848 deletions(-)
diff --git a/main/openssl/Apps.mk b/main/openssl/Apps.mk index 3fb94dbe..b2d871c1 100644 --- a/main/openssl/Apps.mk +++ b/main/openssl/Apps.mk @@ -1,9 +1,12 @@  # Copyright 2006 The Android Open Source Project -LOCAL_PATH:= $(call my-dir) +LOCAL_PATH := $(call my-dir)  include $(CLEAR_VARS) -LOCAL_MODULE:= openssl +LOCAL_MODULE := openssl +LOCAL_MULTILIB := both +LOCAL_MODULE_STEM_32 := openssl +LOCAL_MODULE_STEM_64 := openssl64  LOCAL_CLANG := true  LOCAL_MODULE_TAGS := optional  LOCAL_SHARED_LIBRARIES := libssl libcrypto @@ -13,7 +16,7 @@ LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/A  include $(BUILD_EXECUTABLE)  include $(CLEAR_VARS) -LOCAL_MODULE:= openssl +LOCAL_MODULE := openssl  LOCAL_MODULE_TAGS := optional  LOCAL_SHARED_LIBRARIES := libssl-host libcrypto-host  include $(LOCAL_PATH)/Apps-config-host.mk diff --git a/main/openssl/Crypto-config-host.mk b/main/openssl/Crypto-config-host.mk index 988df044..5b643792 100644 --- a/main/openssl/Crypto-config-host.mk +++ b/main/openssl/Crypto-config-host.mk @@ -332,7 +332,6 @@ common_src_files := \    crypto/evp/m_md5.c \    crypto/evp/m_mdc2.c \    crypto/evp/m_null.c \ -  crypto/evp/m_ripemd.c \    crypto/evp/m_sha1.c \    crypto/evp/m_sigver.c \    crypto/evp/m_wp.c \ @@ -438,8 +437,6 @@ common_src_files := \    crypto/rc4/rc4_enc.c \    crypto/rc4/rc4_skey.c \    crypto/rc4/rc4_utl.c \ -  crypto/ripemd/rmd_dgst.c \ -  crypto/ripemd/rmd_one.c \    crypto/rsa/rsa_ameth.c \    crypto/rsa/rsa_asn1.c \    crypto/rsa/rsa_chk.c \ @@ -546,6 +543,7 @@ common_c_includes := \  arm_cflags := \    -DAES_ASM \    -DBSAES_ASM \ +  -DDES_UNROLL \    -DGHASH_ASM \    -DOPENSSL_BN_ASM_GF2m \    -DOPENSSL_BN_ASM_MONT \ @@ -556,12 +554,14 @@ arm_cflags := \  arm_src_files := \    crypto/aes/asm/aes-armv4.S \ +  crypto/aes/asm/aesv8-armx.S \    crypto/aes/asm/bsaes-armv7.S \    crypto/armcap.c \    crypto/armv4cpuid.S \    crypto/bn/asm/armv4-gf2m.S \    crypto/bn/asm/armv4-mont.S \    crypto/modes/asm/ghash-armv4.S \ +  crypto/modes/asm/ghashv8-armx.S \    crypto/sha/asm/sha1-armv4-large.S \    crypto/sha/asm/sha256-armv4.S \    crypto/sha/asm/sha512-armv4.S \ @@ -571,9 +571,20 @@ arm_exclude_files := \    crypto/mem_clr.c \  arm64_cflags := \ -  -DOPENSSL_NO_ASM \ +  -DDES_UNROLL \ +  -DOPENSSL_CPUID_OBJ \ +  -DSHA1_ASM \ +  -DSHA256_ASM \ +  -DSHA512_ASM \ -arm64_src_files := +arm64_src_files := \ +  crypto/aes/asm/aesv8-armx-64.S \ +  crypto/arm64cpuid.S \ +  crypto/armcap.c \ +  crypto/modes/asm/ghashv8-armx-64.S \ +  crypto/sha/asm/sha1-armv8.S \ +  crypto/sha/asm/sha256-armv8.S \ +  crypto/sha/asm/sha512-armv8.S \  arm64_exclude_files := @@ -589,6 +600,8 @@ x86_cflags := \    -DOPENSSL_BN_ASM_PART_WORDS \    -DOPENSSL_CPUID_OBJ \    -DOPENSSL_IA32_SSE2 \ +  -DRC4_INDEX \ +  -DRMD160_ASM \    -DSHA1_ASM \    -DSHA256_ASM \    -DSHA512_ASM \ @@ -624,8 +637,6 @@ x86_exclude_files := \  x86_64_cflags := \    -DAES_ASM \    -DBSAES_ASM \ -  -DDES_PTR \ -  -DDES_RISC1 \    -DDES_UNROLL \    -DGHASH_ASM \    -DMD5_ASM \ @@ -633,6 +644,7 @@ x86_64_cflags := \    -DOPENSSL_BN_ASM_MONT \    -DOPENSSL_BN_ASM_MONT5 \    -DOPENSSL_CPUID_OBJ \ +  -DOPENSSL_IA32_SSE2 \    -DSHA1_ASM \    -DSHA256_ASM \    -DSHA512_ASM \ diff --git a/main/openssl/Crypto-config-target.mk b/main/openssl/Crypto-config-target.mk index 6dc14066..bd29dfe5 100644 --- a/main/openssl/Crypto-config-target.mk +++ b/main/openssl/Crypto-config-target.mk @@ -332,7 +332,6 @@ common_src_files := \    crypto/evp/m_md5.c \    crypto/evp/m_mdc2.c \    
crypto/evp/m_null.c \ -  crypto/evp/m_ripemd.c \    crypto/evp/m_sha1.c \    crypto/evp/m_sigver.c \    crypto/evp/m_wp.c \ @@ -438,8 +437,6 @@ common_src_files := \    crypto/rc4/rc4_enc.c \    crypto/rc4/rc4_skey.c \    crypto/rc4/rc4_utl.c \ -  crypto/ripemd/rmd_dgst.c \ -  crypto/ripemd/rmd_one.c \    crypto/rsa/rsa_ameth.c \    crypto/rsa/rsa_asn1.c \    crypto/rsa/rsa_chk.c \ @@ -546,6 +543,7 @@ common_c_includes := \  arm_cflags := \    -DAES_ASM \    -DBSAES_ASM \ +  -DDES_UNROLL \    -DGHASH_ASM \    -DOPENSSL_BN_ASM_GF2m \    -DOPENSSL_BN_ASM_MONT \ @@ -556,12 +554,14 @@ arm_cflags := \  arm_src_files := \    crypto/aes/asm/aes-armv4.S \ +  crypto/aes/asm/aesv8-armx.S \    crypto/aes/asm/bsaes-armv7.S \    crypto/armcap.c \    crypto/armv4cpuid.S \    crypto/bn/asm/armv4-gf2m.S \    crypto/bn/asm/armv4-mont.S \    crypto/modes/asm/ghash-armv4.S \ +  crypto/modes/asm/ghashv8-armx.S \    crypto/sha/asm/sha1-armv4-large.S \    crypto/sha/asm/sha256-armv4.S \    crypto/sha/asm/sha512-armv4.S \ @@ -571,9 +571,20 @@ arm_exclude_files := \    crypto/mem_clr.c \  arm64_cflags := \ -  -DOPENSSL_NO_ASM \ +  -DDES_UNROLL \ +  -DOPENSSL_CPUID_OBJ \ +  -DSHA1_ASM \ +  -DSHA256_ASM \ +  -DSHA512_ASM \ -arm64_src_files := +arm64_src_files := \ +  crypto/aes/asm/aesv8-armx-64.S \ +  crypto/arm64cpuid.S \ +  crypto/armcap.c \ +  crypto/modes/asm/ghashv8-armx-64.S \ +  crypto/sha/asm/sha1-armv8.S \ +  crypto/sha/asm/sha256-armv8.S \ +  crypto/sha/asm/sha512-armv8.S \  arm64_exclude_files := @@ -589,6 +600,8 @@ x86_cflags := \    -DOPENSSL_BN_ASM_PART_WORDS \    -DOPENSSL_CPUID_OBJ \    -DOPENSSL_IA32_SSE2 \ +  -DRC4_INDEX \ +  -DRMD160_ASM \    -DSHA1_ASM \    -DSHA256_ASM \    -DSHA512_ASM \ @@ -624,8 +637,6 @@ x86_exclude_files := \  x86_64_cflags := \    -DAES_ASM \    -DBSAES_ASM \ -  -DDES_PTR \ -  -DDES_RISC1 \    -DDES_UNROLL \    -DGHASH_ASM \    -DMD5_ASM \ @@ -633,6 +644,7 @@ x86_64_cflags := \    -DOPENSSL_BN_ASM_MONT \    -DOPENSSL_BN_ASM_MONT5 \    -DOPENSSL_CPUID_OBJ \ +  -DOPENSSL_IA32_SSE2 \    -DSHA1_ASM \    -DSHA256_ASM \    -DSHA512_ASM \ diff --git a/main/openssl/Crypto.mk b/main/openssl/Crypto.mk index 16448465..6565f97c 100644 --- a/main/openssl/Crypto.mk +++ b/main/openssl/Crypto.mk @@ -9,7 +9,7 @@ LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)  LOCAL_SDK_VERSION := 9  LOCAL_MODULE_TAGS := optional -LOCAL_MODULE:= libcrypto_static +LOCAL_MODULE := libcrypto_static  LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk  include $(LOCAL_PATH)/Crypto-config-target.mk  include $(LOCAL_PATH)/android-config.mk @@ -31,7 +31,7 @@ LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)  # in the NDK.  
ifeq (,$(TARGET_BUILD_APPS))  LOCAL_CLANG := true -ifeq ($(HOST_OS), darwinXXX) +ifeq ($(HOST_OS), darwin_does_not_wrok)  LOCAL_ASFLAGS += -no-integrated-as  LOCAL_CFLAGS += -no-integrated-as  endif @@ -41,7 +41,7 @@ endif  LOCAL_LDFLAGS += -ldl  LOCAL_MODULE_TAGS := optional -LOCAL_MODULE:= libcrypto +LOCAL_MODULE := libcrypto  LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk  include $(LOCAL_PATH)/Crypto-config-target.mk  include $(LOCAL_PATH)/android-config.mk @@ -50,16 +50,16 @@ include $(BUILD_SHARED_LIBRARY)  #######################################  # host shared library -# include $(CLEAR_VARS) -# LOCAL_SHARED_LIBRARIES := $(log_shared_libraries) -# LOCAL_CFLAGS += -DPURIFY -# LOCAL_LDLIBS += -ldl -# LOCAL_MODULE_TAGS := optional -# LOCAL_MODULE:= libcrypto-host -# LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk -# include $(LOCAL_PATH)/Crypto-config-host.mk -# include $(LOCAL_PATH)/android-config.mk -# include $(BUILD_HOST_SHARED_LIBRARY) +#include $(CLEAR_VARS) +#LOCAL_SHARED_LIBRARIES := $(log_shared_libraries) +#LOCAL_CFLAGS += -DPURIFY +#LOCAL_LDLIBS += -ldl +#LOCAL_MODULE_TAGS := optional +#LOCAL_MODULE := libcrypto-host +#LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk +#include $(LOCAL_PATH)/Crypto-config-host.mk +#include $(LOCAL_PATH)/android-config.mk +#include $(BUILD_HOST_SHARED_LIBRARY)  ########################################  # host static library, which is used by some SDK tools. @@ -69,8 +69,9 @@ include $(BUILD_SHARED_LIBRARY)  # LOCAL_CFLAGS += -DPURIFY  # LOCAL_LDLIBS += -ldl  # LOCAL_MODULE_TAGS := optional -# LOCAL_MODULE:= libcrypto_static +# LOCAL_MODULE := libcrypto_static  # LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk  # include $(LOCAL_PATH)/Crypto-config-host.mk  # include $(LOCAL_PATH)/android-config.mk  # include $(BUILD_HOST_STATIC_LIBRARY) + diff --git a/main/openssl/Ssl.mk b/main/openssl/Ssl.mk index 8ce82d9b..6c04950a 100644 --- a/main/openssl/Ssl.mk +++ b/main/openssl/Ssl.mk @@ -12,7 +12,7 @@ LOCAL_CFLAGS += $(target_c_flags)  LOCAL_C_INCLUDES += $(target_c_includes)  LOCAL_SHARED_LIBRARIES = $(log_shared_libraries)  LOCAL_MODULE_TAGS := optional -LOCAL_MODULE:= libssl_static +LOCAL_MODULE := libssl_static  LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk  include $(LOCAL_PATH)/Ssl-config-target.mk  include $(LOCAL_PATH)/android-config.mk @@ -35,19 +35,20 @@ endif  LOCAL_SHARED_LIBRARIES += libcrypto $(log_shared_libraries)  LOCAL_MODULE_TAGS := optional -LOCAL_MODULE:= libssl +LOCAL_MODULE := libssl  LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk  include $(LOCAL_PATH)/Ssl-config-target.mk  include $(LOCAL_PATH)/android-config.mk  include $(LOCAL_PATH)/ndk-build.mk  include $(BUILD_SHARED_LIBRARY) +  # #######################################  # # host shared library  # include $(CLEAR_VARS)  # LOCAL_SHARED_LIBRARIES += libcrypto-host $(log_shared_libraries)  # LOCAL_MODULE_TAGS := optional -# LOCAL_MODULE:= libssl-host +# LOCAL_MODULE := libssl-host  # LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk  # include $(LOCAL_PATH)/Ssl-config-host.mk  # include $(LOCAL_PATH)/android-config.mk @@ -56,9 +57,12 @@ include $(BUILD_SHARED_LIBRARY)  # #######################################  # # ssltest  # include $(CLEAR_VARS) -# LOCAL_SRC_FILES:= ssl/ssltest.c +# LOCAL_SRC_FILES := 
ssl/ssltest.c  # LOCAL_SHARED_LIBRARIES := libssl libcrypto $(log_shared_libraries) -# LOCAL_MODULE:= ssltest +# LOCAL_MODULE := ssltest +# LOCAL_MULTILIB := both +# LOCAL_MODULE_STEM_32 := ssltest +# LOCAL_MODULE_STEM_64 := ssltest64  # LOCAL_MODULE_TAGS := optional  # LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk  # include $(LOCAL_PATH)/Ssl-config-host.mk diff --git a/main/openssl/build-config-32.mk b/main/openssl/build-config-32.mk index bc2aa442..d035f1e4 100644 --- a/main/openssl/build-config-32.mk +++ b/main/openssl/build-config-32.mk @@ -24,6 +24,7 @@ openssl_cflags_32 := \    -DOPENSSL_NO_RC5 \    -DOPENSSL_NO_RDRAND \    -DOPENSSL_NO_RFC3779 \ +  -DOPENSSL_NO_RIPEMD \    -DOPENSSL_NO_RSAX \    -DOPENSSL_NO_SCTP \    -DOPENSSL_NO_SEED \ @@ -52,6 +53,7 @@ openssl_cflags_static_32 := \    -DOPENSSL_NO_RC5 \    -DOPENSSL_NO_RDRAND \    -DOPENSSL_NO_RFC3779 \ +  -DOPENSSL_NO_RIPEMD \    -DOPENSSL_NO_RSAX \    -DOPENSSL_NO_SCTP \    -DOPENSSL_NO_SEED \ diff --git a/main/openssl/build-config-64.mk b/main/openssl/build-config-64.mk index fde3b6ab..45a8141d 100644 --- a/main/openssl/build-config-64.mk +++ b/main/openssl/build-config-64.mk @@ -24,6 +24,7 @@ openssl_cflags_64 := \    -DOPENSSL_NO_RC5 \    -DOPENSSL_NO_RDRAND \    -DOPENSSL_NO_RFC3779 \ +  -DOPENSSL_NO_RIPEMD \    -DOPENSSL_NO_RSAX \    -DOPENSSL_NO_SCTP \    -DOPENSSL_NO_SEED \ @@ -52,6 +53,7 @@ openssl_cflags_static_64 := \    -DOPENSSL_NO_RC5 \    -DOPENSSL_NO_RDRAND \    -DOPENSSL_NO_RFC3779 \ +  -DOPENSSL_NO_RIPEMD \    -DOPENSSL_NO_RSAX \    -DOPENSSL_NO_SCTP \    -DOPENSSL_NO_SEED \ diff --git a/main/openssl/check-all-builds.sh b/main/openssl/check-all-builds.sh index cff2ba5d..9743872a 100755 --- a/main/openssl/check-all-builds.sh +++ b/main/openssl/check-all-builds.sh @@ -143,7 +143,7 @@ esac  # NOTE: x86_64 is not ready yet, while the toolchain is in  # prebuilts/ it doesn't have a sysroot which means it requires  # a platform build to get Bionic and stuff. -ANDROID_ARCHS="arm x86 mips" +ANDROID_ARCHS="arm arm64 x86 x86_64 mips"  BUILD_TYPES=  for ARCH in $ANDROID_ARCHS; do @@ -311,11 +311,14 @@ get_build_arch () {  # Out: GNU configuration target (e.g. arm-linux-androideabi)  get_build_arch_target () {    case $1 in +    arm64) +      echo "aarch64-linux-android" +      ;;      arm)        echo "arm-linux-androideabi"        ;;      x86) -      echo "i686-linux-android" +      echo "x86_64-linux-android"        ;;      x86_64)        echo "x86_64-linux-android" @@ -329,8 +332,8 @@ get_build_arch_target () {    esac  } -GCC_VERSION=4.7 -CLANG_VERSION=3.1 +GCC_VERSION=4.8 +CLANG_VERSION=3.2  get_prebuilt_gcc_dir_for_arch () {    local arch=$1 @@ -341,6 +344,9 @@ get_prebuilt_gcc_dir_for_arch () {      x86_64)          arch=x86          ;; +    arm64) +        arch=aarch64 +        ;;    esac    echo "$ANDROID_BUILD_TOP/prebuilts/gcc/$ANDROID_HOST_TAG/$arch/$target-$GCC_VERSION"  } @@ -397,7 +403,7 @@ get_build_compiler () {    # Force -m32 flag when needed for 32-bit builds.    
case $1 in -    *-linux-x86|*-darwin-x86|*-generic32) +    *-x86|*-generic32)        result="$result -m32"        ;;    esac diff --git a/main/openssl/crypto/aes/asm/aes-armv4.pl b/main/openssl/crypto/aes/asm/aes-armv4.pl index 86b86c4a..4f891708 100644 --- a/main/openssl/crypto/aes/asm/aes-armv4.pl +++ b/main/openssl/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@  #!/usr/bin/env perl  # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL  # project. The module is, however, dual licensed under OpenSSL and  # CRYPTOGAMS licenses depending on where you obtain it. For further  # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,9 +51,23 @@ $key="r11";  $rounds="r12";  $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif +  .text +#if __ARM_ARCH__<7 +.code	32 +#else +.syntax	unified +# ifdef __thumb2__ +.thumb +# else  .code	32 +# endif +#endif  .type	AES_Te,%object  .align	5 @@ -167,7 +181,11 @@ AES_Te:  .type   AES_encrypt,%function  .align	5  AES_encrypt: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_encrypt +#else +	adr	r3,AES_encrypt +#endif  	stmdb   sp!,{r1,r4-r12,lr}  	mov	$rounds,r0		@ inp  	mov	$key,r2 @@ -409,11 +427,21 @@ _armv4_AES_encrypt:  .align	5  private_AES_set_encrypt_key:  _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_set_encrypt_key +#else +	adr	r3,private_AES_set_encrypt_key +#endif  	teq	r0,#0 +#if __ARM_ARCH__>=7 +	itt	eq			@ Thumb2 thing, sanity check in ARM +#endif  	moveq	r0,#-1  	beq	.Labrt  	teq	r2,#0 +#if __ARM_ARCH__>=7 +	itt	eq			@ Thumb2 thing, sanity check in ARM +#endif  	moveq	r0,#-1  	beq	.Labrt @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:  	teq	r1,#192  	beq	.Lok  	teq	r1,#256 +#if __ARM_ARCH__>=7 +	itt	ne			@ Thumb2 thing, sanity check in ARM +#endif  	movne	r0,#-1  	bne	.Labrt @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:  	str	$s2,[$key,#-16]  	subs	$rounds,$rounds,#1  	str	$s3,[$key,#-12] +#if __ARM_ARCH__>=7 +	itt	eq				@ Thumb2 thing, sanity check in ARM +#endif  	subeq	r2,$key,#216  	beq	.Ldone @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:  	str	$s2,[$key,#-24]  	subs	$rounds,$rounds,#1  	str	$s3,[$key,#-20] +#if __ARM_ARCH__>=7 +	itt	eq				@ Thumb2 thing, sanity check in ARM +#endif  	subeq	r2,$key,#256  	beq	.Ldone @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:  	str	$i3,[$key,#-4]  	b	.L256_loop +.align	2  .Ldone:	mov	r0,#0  	ldmia   sp!,{r4-r12,lr} -.Labrt:	tst	lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 +	ret				@ bx lr +#else +	tst	lr,#1  	moveq	pc,lr			@ be binary compatible with V4, yet  	bx	lr			@ interoperable with Thumb ISA:-) +#endif  .size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key  .global private_AES_set_decrypt_key @@ -688,34 +731,57 @@ private_AES_set_decrypt_key:  	str	lr,[sp,#-4]!            
@ push lr  	bl	_armv4_AES_set_encrypt_key  	teq	r0,#0 -	ldrne	lr,[sp],#4              @ pop lr +	ldr	lr,[sp],#4              @ pop lr  	bne	.Labrt -	stmdb   sp!,{r4-r12} +	mov	r0,r2			@ AES_set_encrypt_key preserves r2, +	mov	r1,r2			@ which is AES_KEY *key +	b	_armv4_AES_set_enc2dec_key +.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key -	ldr	$rounds,[r2,#240]	@ AES_set_encrypt_key preserves r2, -	mov	$key,r2			@ which is AES_KEY *key -	mov	$i1,r2 -	add	$i2,r2,$rounds,lsl#4 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global	AES_set_enc2dec_key +.type	AES_set_enc2dec_key,%function +.align	5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: +	stmdb   sp!,{r4-r12,lr} + +	ldr	$rounds,[r0,#240] +	mov	$i1,r0			@ input +	add	$i2,r0,$rounds,lsl#4 +	mov	$key,r1			@ ouput +	add	$tbl,r1,$rounds,lsl#4 +	str	$rounds,[r1,#240] + +.Linv:	ldr	$s0,[$i1],#16 +	ldr	$s1,[$i1,#-12] +	ldr	$s2,[$i1,#-8] +	ldr	$s3,[$i1,#-4] +	ldr	$t1,[$i2],#-16 +	ldr	$t2,[$i2,#16+4] +	ldr	$t3,[$i2,#16+8] +	ldr	$i3,[$i2,#16+12] +	str	$s0,[$tbl],#-16 +	str	$s1,[$tbl,#16+4] +	str	$s2,[$tbl,#16+8] +	str	$s3,[$tbl,#16+12] +	str	$t1,[$key],#16 +	str	$t2,[$key,#-12] +	str	$t3,[$key,#-8] +	str	$i3,[$key,#-4] +	teq	$i1,$i2 +	bne	.Linv -.Linv:	ldr	$s0,[$i1] +	ldr	$s0,[$i1]  	ldr	$s1,[$i1,#4]  	ldr	$s2,[$i1,#8]  	ldr	$s3,[$i1,#12] -	ldr	$t1,[$i2] -	ldr	$t2,[$i2,#4] -	ldr	$t3,[$i2,#8] -	ldr	$i3,[$i2,#12] -	str	$s0,[$i2],#-16 -	str	$s1,[$i2,#16+4] -	str	$s2,[$i2,#16+8] -	str	$s3,[$i2,#16+12] -	str	$t1,[$i1],#16 -	str	$t2,[$i1,#-12] -	str	$t3,[$i1,#-8] -	str	$i3,[$i1,#-4] -	teq	$i1,$i2 -	bne	.Linv +	str	$s0,[$key] +	str	$s1,[$key,#4] +	str	$s2,[$key,#8] +	str	$s3,[$key,#12] +	sub	$key,$key,$rounds,lsl#3  ___  $mask80=$i1;  $mask1b=$i2; @@ -773,7 +839,7 @@ $code.=<<___;  	moveq	pc,lr			@ be binary compatible with V4, yet  	bx	lr			@ interoperable with Thumb ISA:-)  #endif -.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size	AES_set_enc2dec_key,.-AES_set_enc2dec_key  .type	AES_Td,%object  .align	5 @@ -883,7 +949,11 @@ AES_Td:  .type   AES_decrypt,%function  .align	5  AES_decrypt: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_decrypt +#else +	adr	r3,AES_decrypt +#endif  	stmdb   sp!,{r1,r4-r12,lr}  	mov	$rounds,r0		@ inp  	mov	$key,r2 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:  	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]  	and	$i3,lr,$s1,lsr#8 +	add	$s1,$tbl,$s1,lsr#24  	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0] -	ldrb	$s1,[$tbl,$s1,lsr#24]	@ Td4[s1>>24] +	ldrb	$s1,[$s1]		@ Td4[s1>>24]  	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]  	eor	$s0,$i1,$s0,lsl#24  	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8] @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:  	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]  	and	$i3,lr,$s2,lsr#16 -	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24] +	add	$s2,$tbl,$s2,lsr#24 +	ldrb	$s2,[$s2]		@ Td4[s2>>24]  	eor	$s0,$s0,$i1,lsl#8  	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]  	eor	$s1,$i2,$s1,lsl#16 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:  	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]  	and	$i3,lr,$s3		@ i2 +	add	$s3,$tbl,$s3,lsr#24  	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0] -	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24] +	ldrb	$s3,[$s3]		@ Td4[s3>>24]  	eor	$s0,$s0,$i1,lsl#16  	ldr	$i1,[$key,#0]  	eor	$s1,$s1,$i2,lsl#8 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:  ___  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; + +open SELF,$0; +while(<SELF>) { +	next if (/^#!/); +	last if (!s/^#/@/ and !/^$/); +	print; +} +close SELF; +  print $code;  close STDOUT;	# enforce flush diff --git 
a/main/openssl/crypto/aes/asm/aes-armv4.s b/main/openssl/crypto/aes/asm/aes-armv4.s index 2697d4ce..333a5227 100644 --- a/main/openssl/crypto/aes/asm/aes-armv4.s +++ b/main/openssl/crypto/aes/asm/aes-armv4.s @@ -1,6 +1,53 @@ -#include "arm_arch.h" + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif +  .text +#if __ARM_ARCH__<7 +.code	32 +#else +.syntax	unified +# ifdef __thumb2__ +.thumb +# else  .code	32 +# endif +#endif  .type	AES_Te,%object  .align	5 @@ -114,7 +161,11 @@ AES_Te:  .type   AES_encrypt,%function  .align	5  AES_encrypt: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_encrypt +#else +	adr	r3,AES_encrypt +#endif  	stmdb   sp!,{r1,r4-r12,lr}  	mov	r12,r0		@ inp  	mov	r11,r2 @@ -356,11 +407,21 @@ _armv4_AES_encrypt:  .align	5  private_AES_set_encrypt_key:  _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_set_encrypt_key +#else +	adr	r3,private_AES_set_encrypt_key +#endif  	teq	r0,#0 +#if __ARM_ARCH__>=7 +	itt	eq			@ Thumb2 thing, sanity check in ARM +#endif  	moveq	r0,#-1  	beq	.Labrt  	teq	r2,#0 +#if __ARM_ARCH__>=7 +	itt	eq			@ Thumb2 thing, sanity check in ARM +#endif  	moveq	r0,#-1  	beq	.Labrt @@ -369,6 +430,9 @@ _armv4_AES_set_encrypt_key:  	teq	r1,#192  	beq	.Lok  	teq	r1,#256 +#if __ARM_ARCH__>=7 +	itt	ne			@ Thumb2 thing, sanity check in ARM +#endif  	movne	r0,#-1  	bne	.Labrt @@ -523,6 +587,9 @@ _armv4_AES_set_encrypt_key:  	str	r2,[r11,#-16]  	subs	r12,r12,#1  	str	r3,[r11,#-12] +#if __ARM_ARCH__>=7 +	itt	eq				@ Thumb2 thing, sanity check in ARM +#endif  	subeq	r2,r11,#216  	beq	.Ldone @@ -592,6 +659,9 @@ _armv4_AES_set_encrypt_key:  	str	r2,[r11,#-24]  	subs	r12,r12,#1  	str	r3,[r11,#-20] +#if __ARM_ARCH__>=7 +	itt	eq				@ Thumb2 thing, sanity check in ARM +#endif  	subeq	r2,r11,#256  	beq	.Ldone @@ -621,11 +691,17 @@ _armv4_AES_set_encrypt_key:  	str	r9,[r11,#-4]  	b	.L256_loop +.align	2  .Ldone:	mov	r0,#0  	ldmia   sp!,{r4-r12,lr} -.Labrt:	tst	lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 +	bx	lr				@ .word	0xe12fff1e +#else +	tst	lr,#1  	moveq	pc,lr			@ be binary compatible with V4, yet  	.word	0xe12fff1e			@ interoperable with Thumb ISA:-) +#endif  .size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key  .global private_AES_set_decrypt_key @@ -635,34 +711,57 @@ private_AES_set_decrypt_key:  	str	
lr,[sp,#-4]!            @ push lr  	bl	_armv4_AES_set_encrypt_key  	teq	r0,#0 -	ldrne	lr,[sp],#4              @ pop lr +	ldr	lr,[sp],#4              @ pop lr  	bne	.Labrt -	stmdb   sp!,{r4-r12} +	mov	r0,r2			@ AES_set_encrypt_key preserves r2, +	mov	r1,r2			@ which is AES_KEY *key +	b	_armv4_AES_set_enc2dec_key +.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global	AES_set_enc2dec_key +.type	AES_set_enc2dec_key,%function +.align	5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: +	stmdb   sp!,{r4-r12,lr} + +	ldr	r12,[r0,#240] +	mov	r7,r0			@ input +	add	r8,r0,r12,lsl#4 +	mov	r11,r1			@ ouput +	add	r10,r1,r12,lsl#4 +	str	r12,[r1,#240] -	ldr	r12,[r2,#240]	@ AES_set_encrypt_key preserves r2, -	mov	r11,r2			@ which is AES_KEY *key -	mov	r7,r2 -	add	r8,r2,r12,lsl#4 +.Linv:	ldr	r0,[r7],#16 +	ldr	r1,[r7,#-12] +	ldr	r2,[r7,#-8] +	ldr	r3,[r7,#-4] +	ldr	r4,[r8],#-16 +	ldr	r5,[r8,#16+4] +	ldr	r6,[r8,#16+8] +	ldr	r9,[r8,#16+12] +	str	r0,[r10],#-16 +	str	r1,[r10,#16+4] +	str	r2,[r10,#16+8] +	str	r3,[r10,#16+12] +	str	r4,[r11],#16 +	str	r5,[r11,#-12] +	str	r6,[r11,#-8] +	str	r9,[r11,#-4] +	teq	r7,r8 +	bne	.Linv -.Linv:	ldr	r0,[r7] +	ldr	r0,[r7]  	ldr	r1,[r7,#4]  	ldr	r2,[r7,#8]  	ldr	r3,[r7,#12] -	ldr	r4,[r8] -	ldr	r5,[r8,#4] -	ldr	r6,[r8,#8] -	ldr	r9,[r8,#12] -	str	r0,[r8],#-16 -	str	r1,[r8,#16+4] -	str	r2,[r8,#16+8] -	str	r3,[r8,#16+12] -	str	r4,[r7],#16 -	str	r5,[r7,#-12] -	str	r6,[r7,#-8] -	str	r9,[r7,#-4] -	teq	r7,r8 -	bne	.Linv +	str	r0,[r11] +	str	r1,[r11,#4] +	str	r2,[r11,#8] +	str	r3,[r11,#12] +	sub	r11,r11,r12,lsl#3  	ldr	r0,[r11,#16]!		@ prefetch tp1  	mov	r7,#0x80  	mov	r8,#0x1b @@ -715,7 +814,7 @@ private_AES_set_decrypt_key:  	moveq	pc,lr			@ be binary compatible with V4, yet  	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)  #endif -.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size	AES_set_enc2dec_key,.-AES_set_enc2dec_key  .type	AES_Td,%object  .align	5 @@ -825,7 +924,11 @@ AES_Td:  .type   AES_decrypt,%function  .align	5  AES_decrypt: +#if __ARM_ARCH__<7  	sub	r3,pc,#8		@ AES_decrypt +#else +	adr	r3,AES_decrypt +#endif  	stmdb   sp!,{r1,r4-r12,lr}  	mov	r12,r0		@ inp  	mov	r11,r2 @@ -1022,8 +1125,9 @@ _armv4_AES_decrypt:  	ldrb	r6,[r10,r9]		@ Td4[s0>>0]  	and	r9,lr,r1,lsr#8 +	add	r1,r10,r1,lsr#24  	ldrb	r7,[r10,r7]		@ Td4[s1>>0] -	ldrb	r1,[r10,r1,lsr#24]	@ Td4[s1>>24] +	ldrb	r1,[r1]		@ Td4[s1>>24]  	ldrb	r8,[r10,r8]		@ Td4[s1>>16]  	eor	r0,r7,r0,lsl#24  	ldrb	r9,[r10,r9]		@ Td4[s1>>8] @@ -1036,7 +1140,8 @@ _armv4_AES_decrypt:  	ldrb	r8,[r10,r8]		@ Td4[s2>>0]  	and	r9,lr,r2,lsr#16 -	ldrb	r2,[r10,r2,lsr#24]	@ Td4[s2>>24] +	add	r2,r10,r2,lsr#24 +	ldrb	r2,[r2]		@ Td4[s2>>24]  	eor	r0,r0,r7,lsl#8  	ldrb	r9,[r10,r9]		@ Td4[s2>>16]  	eor	r1,r8,r1,lsl#16 @@ -1048,8 +1153,9 @@ _armv4_AES_decrypt:  	ldrb	r8,[r10,r8]		@ Td4[s3>>8]  	and	r9,lr,r3		@ i2 +	add	r3,r10,r3,lsr#24  	ldrb	r9,[r10,r9]		@ Td4[s3>>0] -	ldrb	r3,[r10,r3,lsr#24]	@ Td4[s3>>24] +	ldrb	r3,[r3]		@ Td4[s3>>24]  	eor	r0,r0,r7,lsl#16  	ldr	r7,[r11,#0]  	eor	r1,r1,r8,lsl#8 diff --git a/main/openssl/crypto/aes/asm/aesv8-armx-64.S b/main/openssl/crypto/aes/asm/aesv8-armx-64.S new file mode 100644 index 00000000..be0a13df --- /dev/null +++ b/main/openssl/crypto/aes/asm/aesv8-armx-64.S @@ -0,0 +1,761 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.arch	armv8-a+crypto +.align	5 +rcon: +.long	0x01,0x01,0x01,0x01 +.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat +.long	0x1b,0x1b,0x1b,0x1b + +.globl	
aes_v8_set_encrypt_key +.type	aes_v8_set_encrypt_key,%function +.align	5 +aes_v8_set_encrypt_key: +.Lenc_key: +	stp	x29,x30,[sp,#-16]! +	add	x29,sp,#0 +	adr	x3,rcon +	cmp	w1,#192 + +	eor	v0.16b,v0.16b,v0.16b +	ld1	{v3.16b},[x0],#16 +	mov	w1,#8		// reuse w1 +	ld1	{v1.4s,v2.4s},[x3],#32 + +	b.lt	.Loop128 +	b.eq	.L192 +	b	.L256 + +.align	4 +.Loop128: +	tbl	v6.16b,{v3.16b},v2.16b +	ext	v5.16b,v0.16b,v3.16b,#12 +	st1	{v3.4s},[x2],#16 +	aese	v6.16b,v0.16b +	subs	w1,w1,#1 + +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	 eor	v6.16b,v6.16b,v1.16b +	eor	v3.16b,v3.16b,v5.16b +	shl	v1.16b,v1.16b,#1 +	eor	v3.16b,v3.16b,v6.16b +	b.ne	.Loop128 + +	ld1	{v1.4s},[x3] + +	tbl	v6.16b,{v3.16b},v2.16b +	ext	v5.16b,v0.16b,v3.16b,#12 +	st1	{v3.4s},[x2],#16 +	aese	v6.16b,v0.16b + +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	 eor	v6.16b,v6.16b,v1.16b +	eor	v3.16b,v3.16b,v5.16b +	shl	v1.16b,v1.16b,#1 +	eor	v3.16b,v3.16b,v6.16b + +	tbl	v6.16b,{v3.16b},v2.16b +	ext	v5.16b,v0.16b,v3.16b,#12 +	st1	{v3.4s},[x2],#16 +	aese	v6.16b,v0.16b + +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	 eor	v6.16b,v6.16b,v1.16b +	eor	v3.16b,v3.16b,v5.16b +	eor	v3.16b,v3.16b,v6.16b +	st1	{v3.4s},[x2] +	add	x2,x2,#0x50 + +	mov	w12,#10 +	b	.Ldone + +.align	4 +.L192: +	ld1	{v4.8b},[x0],#8 +	movi	v6.16b,#8			// borrow v6.16b +	st1	{v3.4s},[x2],#16 +	sub	v2.16b,v2.16b,v6.16b	// adjust the mask + +.Loop192: +	tbl	v6.16b,{v4.16b},v2.16b +	ext	v5.16b,v0.16b,v3.16b,#12 +	st1	{v4.8b},[x2],#8 +	aese	v6.16b,v0.16b +	subs	w1,w1,#1 + +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b + +	dup	v5.4s,v3.s[3] +	eor	v5.16b,v5.16b,v4.16b +	 eor	v6.16b,v6.16b,v1.16b +	ext	v4.16b,v0.16b,v4.16b,#12 +	shl	v1.16b,v1.16b,#1 +	eor	v4.16b,v4.16b,v5.16b +	eor	v3.16b,v3.16b,v6.16b +	eor	v4.16b,v4.16b,v6.16b +	st1	{v3.4s},[x2],#16 +	b.ne	.Loop192 + +	mov	w12,#12 +	add	x2,x2,#0x20 +	b	.Ldone + +.align	4 +.L256: +	ld1	{v4.16b},[x0] +	mov	w1,#7 +	mov	w12,#14 +	st1	{v3.4s},[x2],#16 + +.Loop256: +	tbl	v6.16b,{v4.16b},v2.16b +	ext	v5.16b,v0.16b,v3.16b,#12 +	st1	{v4.4s},[x2],#16 +	aese	v6.16b,v0.16b +	subs	w1,w1,#1 + +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v3.16b,v3.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	 eor	v6.16b,v6.16b,v1.16b +	eor	v3.16b,v3.16b,v5.16b +	shl	v1.16b,v1.16b,#1 +	eor	v3.16b,v3.16b,v6.16b +	st1	{v3.4s},[x2],#16 +	b.eq	.Ldone + +	dup	v6.4s,v3.s[3]		// just splat +	ext	v5.16b,v0.16b,v4.16b,#12 +	aese	v6.16b,v0.16b + +	eor	v4.16b,v4.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v4.16b,v4.16b,v5.16b +	ext	v5.16b,v0.16b,v5.16b,#12 +	eor	v4.16b,v4.16b,v5.16b + +	eor	v4.16b,v4.16b,v6.16b +	b	.Loop256 + +.Ldone: +	str	w12,[x2] + +	eor	x0,x0,x0		// return value +	ldr	x29,[sp],#16 +	ret +.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl	aes_v8_set_decrypt_key +.type	aes_v8_set_decrypt_key,%function +.align	5 +aes_v8_set_decrypt_key: +	stp	x29,x30,[sp,#-16]! 
+	add	x29,sp,#0 +	bl	.Lenc_key + +	sub	x2,x2,#240		// restore original x2 +	mov	x4,#-16 +	add	x0,x2,x12,lsl#4	// end of key schedule + +	ld1	{v0.4s},[x2] +	ld1	{v1.4s},[x0] +	st1	{v0.4s},[x0],x4 +	st1	{v1.4s},[x2],#16 + +.Loop_imc: +	ld1	{v0.4s},[x2] +	ld1	{v1.4s},[x0] +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	st1	{v0.4s},[x0],x4 +	st1	{v1.4s},[x2],#16 +	cmp	x0,x2 +	b.hi	.Loop_imc + +	ld1	{v0.4s},[x2] +	aesimc	v0.16b,v0.16b +	st1	{v0.4s},[x0] + +	eor	x0,x0,x0		// return value +	ldp	x29,x30,[sp],#16 +	ret +.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl	aes_v8_encrypt +.type	aes_v8_encrypt,%function +.align	5 +aes_v8_encrypt: +	ldr	w3,[x2,#240] +	ld1	{v0.4s},[x2],#16 +	ld1	{v2.16b},[x0] +	sub	w3,w3,#2 +	ld1	{v1.4s},[x2],#16 + +.Loop_enc: +	aese	v2.16b,v0.16b +	ld1	{v0.4s},[x2],#16 +	aesmc	v2.16b,v2.16b +	subs	w3,w3,#2 +	aese	v2.16b,v1.16b +	ld1	{v1.4s},[x2],#16 +	aesmc	v2.16b,v2.16b +	b.gt	.Loop_enc + +	aese	v2.16b,v0.16b +	ld1	{v0.4s},[x2] +	aesmc	v2.16b,v2.16b +	aese	v2.16b,v1.16b +	eor	v2.16b,v2.16b,v0.16b + +	st1	{v2.16b},[x1] +	ret +.size	aes_v8_encrypt,.-aes_v8_encrypt +.globl	aes_v8_decrypt +.type	aes_v8_decrypt,%function +.align	5 +aes_v8_decrypt: +	ldr	w3,[x2,#240] +	ld1	{v0.4s},[x2],#16 +	ld1	{v2.16b},[x0] +	sub	w3,w3,#2 +	ld1	{v1.4s},[x2],#16 + +.Loop_dec: +	aesd	v2.16b,v0.16b +	ld1	{v0.4s},[x2],#16 +	aesimc	v2.16b,v2.16b +	subs	w3,w3,#2 +	aesd	v2.16b,v1.16b +	ld1	{v1.4s},[x2],#16 +	aesimc	v2.16b,v2.16b +	b.gt	.Loop_dec + +	aesd	v2.16b,v0.16b +	ld1	{v0.4s},[x2] +	aesimc	v2.16b,v2.16b +	aesd	v2.16b,v1.16b +	eor	v2.16b,v2.16b,v0.16b + +	st1	{v2.16b},[x1] +	ret +.size	aes_v8_decrypt,.-aes_v8_decrypt +.globl	aes_v8_cbc_encrypt +.type	aes_v8_cbc_encrypt,%function +.align	5 +aes_v8_cbc_encrypt: +	stp	x29,x30,[sp,#-16]! +	add	x29,sp,#0 +	subs	x2,x2,#16 +	mov	x8,#16 +	b.lo	.Lcbc_abort +	csel	x8,xzr,x8,eq + +	cmp	w5,#0			// en- or decrypting? +	ldr	w5,[x3,#240] +	and	x2,x2,#-16 +	ld1	{v6.16b},[x4] +	ld1	{v0.16b},[x0],x8 + +	ld1	{v16.4s-v17.4s},[x3]		// load key schedule... 
+	sub	w5,w5,#6 +	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys +	sub	w5,w5,#2 +	ld1	{v18.4s-v19.4s},[x7],#32 +	ld1	{v20.4s-v21.4s},[x7],#32 +	ld1	{v22.4s-v23.4s},[x7],#32 +	ld1	{v7.4s},[x7] + +	add	x7,x3,#32 +	mov	w6,w5 +	b.eq	.Lcbc_dec + +	cmp	w5,#2 +	eor	v0.16b,v0.16b,v6.16b +	eor	v5.16b,v16.16b,v7.16b +	b.eq	.Lcbc_enc128 + +.Loop_cbc_enc: +	aese	v0.16b,v16.16b +	ld1	{v16.4s},[x7],#16 +	aesmc	v0.16b,v0.16b +	subs	w6,w6,#2 +	aese	v0.16b,v17.16b +	ld1	{v17.4s},[x7],#16 +	aesmc	v0.16b,v0.16b +	b.gt	.Loop_cbc_enc + +	aese	v0.16b,v16.16b +	aesmc	v0.16b,v0.16b +	 subs	x2,x2,#16 +	aese	v0.16b,v17.16b +	aesmc	v0.16b,v0.16b +	 csel	x8,xzr,x8,eq +	aese	v0.16b,v18.16b +	aesmc	v0.16b,v0.16b +	 add	x7,x3,#16 +	aese	v0.16b,v19.16b +	aesmc	v0.16b,v0.16b +	 ld1	{v16.16b},[x0],x8 +	aese	v0.16b,v20.16b +	aesmc	v0.16b,v0.16b +	 eor	v16.16b,v16.16b,v5.16b +	aese	v0.16b,v21.16b +	aesmc	v0.16b,v0.16b +	 ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1] +	aese	v0.16b,v22.16b +	aesmc	v0.16b,v0.16b +	aese	v0.16b,v23.16b + +	 mov	w6,w5 +	eor	v6.16b,v0.16b,v7.16b +	st1	{v6.16b},[x1],#16 +	b.hs	.Loop_cbc_enc + +	b	.Lcbc_done + +.align	5 +.Lcbc_enc128: +	ld1	{v2.4s-v3.4s},[x7] +	aese	v0.16b,v16.16b +	aesmc	v0.16b,v0.16b +	b	.Lenter_cbc_enc128 +.Loop_cbc_enc128: +	aese	v0.16b,v16.16b +	aesmc	v0.16b,v0.16b +	 st1	{v6.16b},[x1],#16 +.Lenter_cbc_enc128: +	aese	v0.16b,v17.16b +	aesmc	v0.16b,v0.16b +	 subs	x2,x2,#16 +	aese	v0.16b,v2.16b +	aesmc	v0.16b,v0.16b +	 csel	x8,xzr,x8,eq +	aese	v0.16b,v3.16b +	aesmc	v0.16b,v0.16b +	aese	v0.16b,v18.16b +	aesmc	v0.16b,v0.16b +	aese	v0.16b,v19.16b +	aesmc	v0.16b,v0.16b +	 ld1	{v16.16b},[x0],x8 +	aese	v0.16b,v20.16b +	aesmc	v0.16b,v0.16b +	aese	v0.16b,v21.16b +	aesmc	v0.16b,v0.16b +	aese	v0.16b,v22.16b +	aesmc	v0.16b,v0.16b +	 eor	v16.16b,v16.16b,v5.16b +	aese	v0.16b,v23.16b +	eor	v6.16b,v0.16b,v7.16b +	b.hs	.Loop_cbc_enc128 + +	st1	{v6.16b},[x1],#16 +	b	.Lcbc_done + +.align	5 +.Lcbc_dec128: +	ld1	{v4.4s-v5.4s},[x7] +	eor	v6.16b,v6.16b,v7.16b +	eor	v2.16b,v0.16b,v7.16b +	mov	x12,x8 + +.Loop2x_cbc_dec128: +	aesd	v0.16b,v16.16b +	aesd	v1.16b,v16.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 subs	x2,x2,#32 +	aesd	v0.16b,v17.16b +	aesd	v1.16b,v17.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 csel	x8,xzr,x8,lo +	aesd	v0.16b,v4.16b +	aesd	v1.16b,v4.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 csel	x12,xzr,x12,ls +	aesd	v0.16b,v5.16b +	aesd	v1.16b,v5.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v18.16b +	aesd	v1.16b,v18.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v19.16b +	aesd	v1.16b,v19.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v20.16b +	aesd	v1.16b,v20.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v21.16b +	aesd	v1.16b,v21.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v22.16b +	aesd	v1.16b,v22.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	aesd	v0.16b,v23.16b +	aesd	v1.16b,v23.16b + +	eor	v6.16b,v6.16b,v0.16b +	ld1	{v0.16b},[x0],x8 +	eor	v2.16b,v2.16b,v1.16b +	ld1	{v1.16b},[x0],x12 +	st1	{v6.16b},[x1],#16 +	eor	v6.16b,v3.16b,v7.16b +	st1	{v2.16b},[x1],#16 +	eor	v2.16b,v0.16b,v7.16b +	orr	v3.16b,v1.16b,v1.16b +	b.hs	.Loop2x_cbc_dec128 + +	adds	x2,x2,#32 +	eor	v6.16b,v6.16b,v7.16b +	b.eq	.Lcbc_done +	eor	v2.16b,v2.16b,v7.16b +	b	.Lcbc_dec_tail + +.align	5 +.Lcbc_dec: +	subs	x2,x2,#16 +	orr	v2.16b,v0.16b,v0.16b +	b.lo	.Lcbc_dec_tail + +	csel	x8,xzr,x8,eq +	cmp	w5,#2 +	ld1	{v1.16b},[x0],x8 +	orr	v3.16b,v1.16b,v1.16b +	b.eq	.Lcbc_dec128 + +.Loop2x_cbc_dec: +	aesd	
v0.16b,v16.16b +	aesd	v1.16b,v16.16b +	ld1	{v16.4s},[x7],#16 +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	subs	w6,w6,#2 +	aesd	v0.16b,v17.16b +	aesd	v1.16b,v17.16b +	ld1	{v17.4s},[x7],#16 +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	b.gt	.Loop2x_cbc_dec + +	aesd	v0.16b,v16.16b +	aesd	v1.16b,v16.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 eor	v4.16b,v6.16b,v7.16b +	 eor	v5.16b,v2.16b,v7.16b +	aesd	v0.16b,v17.16b +	aesd	v1.16b,v17.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 orr	v6.16b,v3.16b,v3.16b +	 subs	x2,x2,#32 +	aesd	v0.16b,v18.16b +	aesd	v1.16b,v18.16b +	aesimc	v0.16b,v0.16b +	 csel	x8,xzr,x8,lo +	aesimc	v1.16b,v1.16b +	 mov	x7,x3 +	aesd	v0.16b,v19.16b +	aesd	v1.16b,v19.16b +	aesimc	v0.16b,v0.16b +	 ld1	{v2.16b},[x0],x8 +	aesimc	v1.16b,v1.16b +	 csel	x8,xzr,x8,ls +	aesd	v0.16b,v20.16b +	aesd	v1.16b,v20.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 ld1	{v3.16b},[x0],x8 +	aesd	v0.16b,v21.16b +	aesd	v1.16b,v21.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0] +	aesd	v0.16b,v22.16b +	aesd	v1.16b,v22.16b +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	 ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1] +	aesd	v0.16b,v23.16b +	aesd	v1.16b,v23.16b + +	 mov	w6,w5 +	eor	v4.16b,v4.16b,v0.16b +	eor	v5.16b,v5.16b,v1.16b +	 orr	v0.16b,v2.16b,v2.16b +	st1	{v4.16b},[x1],#16 +	 orr	v1.16b,v3.16b,v3.16b +	st1	{v5.16b},[x1],#16 +	b.hs	.Loop2x_cbc_dec + +	adds	x2,x2,#32 +	b.eq	.Lcbc_done + +.Lcbc_dec_tail: +	aesd	v0.16b,v16.16b +	ld1	{v16.4s},[x7],#16 +	aesimc	v0.16b,v0.16b +	subs	w6,w6,#2 +	aesd	v0.16b,v17.16b +	ld1	{v17.4s},[x7],#16 +	aesimc	v0.16b,v0.16b +	b.gt	.Lcbc_dec_tail + +	aesd	v0.16b,v16.16b +	aesimc	v0.16b,v0.16b +	aesd	v0.16b,v17.16b +	aesimc	v0.16b,v0.16b +	 eor	v4.16b,v6.16b,v7.16b +	aesd	v0.16b,v18.16b +	aesimc	v0.16b,v0.16b +	 orr	v6.16b,v2.16b,v2.16b +	aesd	v0.16b,v19.16b +	aesimc	v0.16b,v0.16b +	aesd	v0.16b,v20.16b +	aesimc	v0.16b,v0.16b +	aesd	v0.16b,v21.16b +	aesimc	v0.16b,v0.16b +	aesd	v0.16b,v22.16b +	aesimc	v0.16b,v0.16b +	aesd	v0.16b,v23.16b + +	eor	v4.16b,v4.16b,v0.16b +	st1	{v4.16b},[x1],#16 + +.Lcbc_done: +	st1	{v6.16b},[x4] +.Lcbc_abort: +	ldr	x29,[sp],#16 +	ret +.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl	aes_v8_ctr32_encrypt_blocks +.type	aes_v8_ctr32_encrypt_blocks,%function +.align	5 +aes_v8_ctr32_encrypt_blocks: +	stp		x29,x30,[sp,#-16]! +	add		x29,sp,#0 +	ldr		w5,[x3,#240] + +	ldr		w8, [x4, #12] +	ld1		{v0.4s},[x4] + +	ld1		{v16.4s-v17.4s},[x3]		// load key schedule... 
+	sub		w5,w5,#6 +	add		x7,x3,x5,lsl#4	// pointer to last 7 round keys +	sub		w5,w5,#2 +	ld1		{v18.4s-v19.4s},[x7],#32 +	ld1		{v20.4s-v21.4s},[x7],#32 +	ld1		{v22.4s-v23.4s},[x7],#32 +	ld1		{v7.4s},[x7] + +	add		x7,x3,#32 +	mov		w6,w5 + +	subs		x2,x2,#2 +	b.lo		.Lctr32_tail + +#ifndef __ARMEB__ +	rev		w8, w8 +#endif +	orr		v1.16b,v0.16b,v0.16b +	add		w8, w8, #1 +	orr		v6.16b,v0.16b,v0.16b +	rev		w10, w8 +	cmp		w5,#2 +	mov		v1.s[3],w10 +	b.eq		.Lctr32_128 + +.Loop2x_ctr32: +	aese		v0.16b,v16.16b +	aese		v1.16b,v16.16b +	ld1		{v16.4s},[x7],#16 +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	subs		w6,w6,#2 +	aese		v0.16b,v17.16b +	aese		v1.16b,v17.16b +	ld1		{v17.4s},[x7],#16 +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	b.gt		.Loop2x_ctr32 + +	aese		v0.16b,v16.16b +	aese		v1.16b,v16.16b +	aesmc		v4.16b,v0.16b +	 orr		v0.16b,v6.16b,v6.16b +	aesmc		v5.16b,v1.16b +	 orr		v1.16b,v6.16b,v6.16b +	aese		v4.16b,v17.16b +	aese		v5.16b,v17.16b +	 ld1		{v2.16b},[x0],#16 +	aesmc		v4.16b,v4.16b +	 ld1		{v3.16b},[x0],#16 +	aesmc		v5.16b,v5.16b +	 add		w8,w8,#1 +	aese		v4.16b,v18.16b +	aese		v5.16b,v18.16b +	 rev		w9,w8 +	aesmc		v4.16b,v4.16b +	aesmc		v5.16b,v5.16b +	 add		w8,w8,#1 +	aese		v4.16b,v19.16b +	aese		v5.16b,v19.16b +	 eor		v2.16b,v2.16b,v7.16b +	 rev		w10,w8 +	aesmc		v4.16b,v4.16b +	aesmc		v5.16b,v5.16b +	 eor		v3.16b,v3.16b,v7.16b +	 mov		x7,x3 +	aese		v4.16b,v20.16b +	aese		v5.16b,v20.16b +	 subs		x2,x2,#2 +	aesmc		v4.16b,v4.16b +	aesmc		v5.16b,v5.16b +	 ld1	 {v16.4s-v17.4s},[x7],#32	// re-pre-load rndkey[0-1] +	aese		v4.16b,v21.16b +	aese		v5.16b,v21.16b +	aesmc		v4.16b,v4.16b +	aesmc		v5.16b,v5.16b +	aese		v4.16b,v22.16b +	aese		v5.16b,v22.16b +	 mov	v0.s[3], w9 +	aesmc		v4.16b,v4.16b +	 mov	v1.s[3], w10 +	aesmc		v5.16b,v5.16b +	aese		v4.16b,v23.16b +	aese		v5.16b,v23.16b + +	 mov		w6,w5 +	eor		v2.16b,v2.16b,v4.16b +	eor		v3.16b,v3.16b,v5.16b +	st1		{v2.16b},[x1],#16 +	st1		{v3.16b},[x1],#16 +	b.hs		.Loop2x_ctr32 + +	adds		x2,x2,#2 +	b.eq		.Lctr32_done +	b		.Lctr32_tail + +.Lctr32_128: +	ld1		{v4.4s-v5.4s},[x7] + +.Loop2x_ctr32_128: +	aese		v0.16b,v16.16b +	aese		v1.16b,v16.16b +	aesmc		v0.16b,v0.16b +	 ld1		{v2.16b},[x0],#16 +	aesmc		v1.16b,v1.16b +	 ld1		{v3.16b},[x0],#16 +	aese		v0.16b,v17.16b +	aese		v1.16b,v17.16b +	 add		w8,w8,#1 +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	 rev		w9,w8 +	aese		v0.16b,v4.16b +	aese		v1.16b,v4.16b +	 add		w8,w8,#1 +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	 rev		w10,w8 +	aese		v0.16b,v5.16b +	aese		v1.16b,v5.16b +	 subs		x2,x2,#2 +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	aese		v0.16b,v18.16b +	aese		v1.16b,v18.16b +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	aese		v0.16b,v19.16b +	aese		v1.16b,v19.16b +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	aese		v0.16b,v20.16b +	aese		v1.16b,v20.16b +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	aese		v0.16b,v21.16b +	aese		v1.16b,v21.16b +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	aese		v0.16b,v22.16b +	aese		v1.16b,v22.16b +	aesmc		v0.16b,v0.16b +	aesmc		v1.16b,v1.16b +	 eor		v2.16b,v2.16b,v7.16b +	aese		v0.16b,v23.16b +	 eor		v3.16b,v3.16b,v7.16b +	aese		v1.16b,v23.16b + +	eor		v2.16b,v2.16b,v0.16b +	orr		v0.16b,v6.16b,v6.16b +	eor		v3.16b,v3.16b,v1.16b +	orr		v1.16b,v6.16b,v6.16b +	st1		{v2.16b},[x1],#16 +	mov		v0.s[3], w9 +	st1		{v3.16b},[x1],#16 +	mov		v1.s[3], w10 +	b.hs		.Loop2x_ctr32_128 + +	adds		x2,x2,#2 +	b.eq		.Lctr32_done + +.Lctr32_tail: +	aese		v0.16b,v16.16b +	ld1		{v16.4s},[x7],#16 +	aesmc		v0.16b,v0.16b +	subs		w6,w6,#2 +	aese		v0.16b,v17.16b +	ld1		{v17.4s},[x7],#16 +	
aesmc		v0.16b,v0.16b +	b.gt		.Lctr32_tail + +	aese		v0.16b,v16.16b +	aesmc		v0.16b,v0.16b +	aese		v0.16b,v17.16b +	aesmc		v0.16b,v0.16b +	 ld1		{v2.16b},[x0] +	aese		v0.16b,v18.16b +	aesmc		v0.16b,v0.16b +	aese		v0.16b,v19.16b +	aesmc		v0.16b,v0.16b +	aese		v0.16b,v20.16b +	aesmc		v0.16b,v0.16b +	aese		v0.16b,v21.16b +	aesmc		v0.16b,v0.16b +	aese		v0.16b,v22.16b +	aesmc		v0.16b,v0.16b +	 eor		v2.16b,v2.16b,v7.16b +	aese		v0.16b,v23.16b + +	eor		v2.16b,v2.16b,v0.16b +	st1		{v2.16b},[x1] + +.Lctr32_done: +	ldr		x29,[sp],#16 +	ret +.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/main/openssl/crypto/aes/asm/aesv8-armx.S b/main/openssl/crypto/aes/asm/aesv8-armx.S new file mode 100644 index 00000000..1637e4d4 --- /dev/null +++ b/main/openssl/crypto/aes/asm/aesv8-armx.S @@ -0,0 +1,767 @@ +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +.fpu	neon +.code	32 +.align	5 +rcon: +.long	0x01,0x01,0x01,0x01 +.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat +.long	0x1b,0x1b,0x1b,0x1b + +.globl	aes_v8_set_encrypt_key +.type	aes_v8_set_encrypt_key,%function +.align	5 +aes_v8_set_encrypt_key: +.Lenc_key: +	adr	r3,rcon +	cmp	r1,#192 + +	veor	q0,q0,q0 +	vld1.8	{q3},[r0]! +	mov	r1,#8		@ reuse r1 +	vld1.32	{q1,q2},[r3]! + +	blt	.Loop128 +	beq	.L192 +	b	.L256 + +.align	4 +.Loop128: +	vtbl.8	d20,{q3},d4 +	vtbl.8	d21,{q3},d5 +	vext.8	q9,q0,q3,#12 +	vst1.32	{q3},[r2]! +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 +	subs	r1,r1,#1 + +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	 veor	q10,q10,q1 +	veor	q3,q3,q9 +	vshl.u8	q1,q1,#1 +	veor	q3,q3,q10 +	bne	.Loop128 + +	vld1.32	{q1},[r3] + +	vtbl.8	d20,{q3},d4 +	vtbl.8	d21,{q3},d5 +	vext.8	q9,q0,q3,#12 +	vst1.32	{q3},[r2]! +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 + +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	 veor	q10,q10,q1 +	veor	q3,q3,q9 +	vshl.u8	q1,q1,#1 +	veor	q3,q3,q10 + +	vtbl.8	d20,{q3},d4 +	vtbl.8	d21,{q3},d5 +	vext.8	q9,q0,q3,#12 +	vst1.32	{q3},[r2]! +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 + +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	 veor	q10,q10,q1 +	veor	q3,q3,q9 +	veor	q3,q3,q10 +	vst1.32	{q3},[r2] +	add	r2,r2,#0x50 + +	mov	r12,#10 +	b	.Ldone + +.align	4 +.L192: +	vld1.8	{d16},[r0]! +	vmov.i8	q10,#8			@ borrow q10 +	vst1.32	{q3},[r2]! +	vsub.i8	q2,q2,q10	@ adjust the mask + +.Loop192: +	vtbl.8	d20,{q8},d4 +	vtbl.8	d21,{q8},d5 +	vext.8	q9,q0,q3,#12 +	vst1.32	{d16},[r2]! +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 +	subs	r1,r1,#1 + +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 + +	vdup.32	q9,d7[1] +	veor	q9,q9,q8 +	 veor	q10,q10,q1 +	vext.8	q8,q0,q8,#12 +	vshl.u8	q1,q1,#1 +	veor	q8,q8,q9 +	veor	q3,q3,q10 +	veor	q8,q8,q10 +	vst1.32	{q3},[r2]! +	bne	.Loop192 + +	mov	r12,#12 +	add	r2,r2,#0x20 +	b	.Ldone + +.align	4 +.L256: +	vld1.8	{q8},[r0] +	mov	r1,#7 +	mov	r12,#14 +	vst1.32	{q3},[r2]! + +.Loop256: +	vtbl.8	d20,{q8},d4 +	vtbl.8	d21,{q8},d5 +	vext.8	q9,q0,q3,#12 +	vst1.32	{q8},[r2]! +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 +	subs	r1,r1,#1 + +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	veor	q3,q3,q9 +	vext.8	q9,q0,q9,#12 +	 veor	q10,q10,q1 +	veor	q3,q3,q9 +	vshl.u8	q1,q1,#1 +	veor	q3,q3,q10 +	vst1.32	{q3},[r2]! 
+	beq	.Ldone + +	vdup.32	q10,d7[1] +	vext.8	q9,q0,q8,#12 +	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0 + +	veor	q8,q8,q9 +	vext.8	q9,q0,q9,#12 +	veor	q8,q8,q9 +	vext.8	q9,q0,q9,#12 +	veor	q8,q8,q9 + +	veor	q8,q8,q10 +	b	.Loop256 + +.Ldone: +	str	r12,[r2] + +	eor	r0,r0,r0		@ return value +	 +	bx	lr +.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl	aes_v8_set_decrypt_key +.type	aes_v8_set_decrypt_key,%function +.align	5 +aes_v8_set_decrypt_key: +	stmdb	sp!,{r4,lr} +	bl	.Lenc_key + +	sub	r2,r2,#240		@ restore original r2 +	mov	r4,#-16 +	add	r0,r2,r12,lsl#4	@ end of key schedule + +	vld1.32	{q0},[r2] +	vld1.32	{q1},[r0] +	vst1.32	{q0},[r0],r4 +	vst1.32	{q1},[r2]! + +.Loop_imc: +	vld1.32	{q0},[r2] +	vld1.32	{q1},[r0] +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	vst1.32	{q0},[r0],r4 +	vst1.32	{q1},[r2]! +	cmp	r0,r2 +	bhi	.Loop_imc + +	vld1.32	{q0},[r2] +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	vst1.32	{q0},[r0] + +	eor	r0,r0,r0		@ return value +	ldmia	sp!,{r4,pc} +.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key +.globl	aes_v8_encrypt +.type	aes_v8_encrypt,%function +.align	5 +aes_v8_encrypt: +	ldr	r3,[r2,#240] +	vld1.32	{q0},[r2]! +	vld1.8	{q2},[r0] +	sub	r3,r3,#2 +	vld1.32	{q1},[r2]! + +.Loop_enc: +	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0 +	vld1.32	{q0},[r2]! +	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2 +	subs	r3,r3,#2 +	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1 +	vld1.32	{q1},[r2]! +	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2 +	bgt	.Loop_enc + +	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0 +	vld1.32	{q0},[r2] +	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2 +	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1 +	veor	q2,q2,q0 + +	vst1.8	{q2},[r1] +	bx	lr +.size	aes_v8_encrypt,.-aes_v8_encrypt +.globl	aes_v8_decrypt +.type	aes_v8_decrypt,%function +.align	5 +aes_v8_decrypt: +	ldr	r3,[r2,#240] +	vld1.32	{q0},[r2]! +	vld1.8	{q2},[r0] +	sub	r3,r3,#2 +	vld1.32	{q1},[r2]! + +.Loop_dec: +	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0 +	vld1.32	{q0},[r2]! +	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2 +	subs	r3,r3,#2 +	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1 +	vld1.32	{q1},[r2]! +	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2 +	bgt	.Loop_dec + +	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0 +	vld1.32	{q0},[r2] +	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2 +	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1 +	veor	q2,q2,q0 + +	vst1.8	{q2},[r1] +	bx	lr +.size	aes_v8_decrypt,.-aes_v8_decrypt +.globl	aes_v8_cbc_encrypt +.type	aes_v8_cbc_encrypt,%function +.align	5 +aes_v8_cbc_encrypt: +	mov	ip,sp +	stmdb	sp!,{r4-r8,lr} +	vstmdb	sp!,{d8-d15}            @ ABI specification says so +	ldmia	ip,{r4-r5}		@ load remaining args +	subs	r2,r2,#16 +	mov	r8,#16 +	blo	.Lcbc_abort +	moveq	r8,#0 + +	cmp	r5,#0			@ en- or decrypting? +	ldr	r5,[r3,#240] +	and	r2,r2,#-16 +	vld1.8	{q6},[r4] +	vld1.8	{q0},[r0],r8 + +	vld1.32	{q8-q9},[r3]		@ load key schedule... +	sub	r5,r5,#6 +	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys +	sub	r5,r5,#2 +	vld1.32	{q10-q11},[r7]! +	vld1.32	{q12-q13},[r7]! +	vld1.32	{q14-q15},[r7]! +	vld1.32	{q7},[r7] + +	add	r7,r3,#32 +	mov	r6,r5 +	beq	.Lcbc_dec + +	cmp	r5,#2 +	veor	q0,q0,q6 +	veor	q5,q8,q7 +	beq	.Lcbc_enc128 + +.Loop_cbc_enc: +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	vld1.32	{q8},[r7]! +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	subs	r6,r6,#2 +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	vld1.32	{q9},[r7]! 
+	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	bgt	.Loop_cbc_enc + +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 subs	r2,r2,#16 +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 moveq	r8,#0 +	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 add	r7,r3,#16 +	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vld1.8	{q8},[r0],r8 +	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 veor	q8,q8,q5 +	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1] +	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15 + +	 mov	r6,r5 +	veor	q6,q0,q7 +	vst1.8	{q6},[r1]! +	bhs	.Loop_cbc_enc + +	b	.Lcbc_done + +.align	5 +.Lcbc_enc128: +	vld1.32	{q2-q3},[r7] +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	b	.Lenter_cbc_enc128 +.Loop_cbc_enc128: +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vst1.8	{q6},[r1]! +.Lenter_cbc_enc128: +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 subs	r2,r2,#16 +	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 moveq	r8,#0 +	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vld1.8	{q8},[r0],r8 +	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 veor	q8,q8,q5 +	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15 +	veor	q6,q0,q7 +	bhs	.Loop_cbc_enc128 + +	vst1.8	{q6},[r1]! 
+	b	.Lcbc_done + +.align	5 +.Lcbc_dec128: +	vld1.32	{q4-q5},[r7] +	veor	q6,q6,q7 +	veor	q2,q0,q7 +	mov	r12,r8 + +.Loop2x_cbc_dec128: +	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8 +	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 subs	r2,r2,#32 +	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9 +	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 movlo	r8,#0 +	.byte	0x48,0x03,0xb0,0xf3	@ aesd q0,q4 +	.byte	0x48,0x23,0xb0,0xf3	@ aesd q1,q4 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 movls	r12,#0 +	.byte	0x4a,0x03,0xb0,0xf3	@ aesd q0,q5 +	.byte	0x4a,0x23,0xb0,0xf3	@ aesd q1,q5 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10 +	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11 +	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12 +	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13 +	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14 +	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15 +	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15 + +	veor	q6,q6,q0 +	vld1.8	{q0},[r0],r8 +	veor	q2,q2,q1 +	vld1.8	{q1},[r0],r12 +	vst1.8	{q6},[r1]! +	veor	q6,q3,q7 +	vst1.8	{q2},[r1]! +	veor	q2,q0,q7 +	vorr	q3,q1,q1 +	bhs	.Loop2x_cbc_dec128 + +	adds	r2,r2,#32 +	veor	q6,q6,q7 +	beq	.Lcbc_done +	veor	q2,q2,q7 +	b	.Lcbc_dec_tail + +.align	5 +.Lcbc_dec: +	subs	r2,r2,#16 +	vorr	q2,q0,q0 +	blo	.Lcbc_dec_tail + +	moveq	r8,#0 +	cmp	r5,#2 +	vld1.8	{q1},[r0],r8 +	vorr	q3,q1,q1 +	beq	.Lcbc_dec128 + +.Loop2x_cbc_dec: +	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8 +	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8 +	vld1.32	{q8},[r7]! +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	subs	r6,r6,#2 +	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9 +	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9 +	vld1.32	{q9},[r7]! 
+	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	bgt	.Loop2x_cbc_dec + +	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8 +	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 veor	q4,q6,q7 +	 veor	q5,q2,q7 +	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9 +	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 vorr	q6,q3,q3 +	 subs	r2,r2,#32 +	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10 +	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	 movlo	r8,#0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 mov	r7,r3 +	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11 +	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	 vld1.8	{q2},[r0],r8 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 movls	r8,#0 +	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12 +	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 vld1.8	{q3},[r0],r8 +	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13 +	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0] +	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14 +	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1 +	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1] +	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15 +	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15 + +	 mov	r6,r5 +	veor	q4,q4,q0 +	veor	q5,q5,q1 +	 vorr	q0,q2,q2 +	vst1.8	{q4},[r1]! +	 vorr	q1,q3,q3 +	vst1.8	{q5},[r1]! +	bhs	.Loop2x_cbc_dec + +	adds	r2,r2,#32 +	beq	.Lcbc_done + +.Lcbc_dec_tail: +	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8 +	vld1.32	{q8},[r7]! +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	subs	r6,r6,#2 +	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9 +	vld1.32	{q9},[r7]! +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	bgt	.Lcbc_dec_tail + +	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	 veor	q4,q6,q7 +	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	 vorr	q6,q2,q2 +	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14 +	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0 +	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15 + +	veor	q4,q4,q0 +	vst1.8	{q4},[r1]! + +.Lcbc_done: +	vst1.8	{q6},[r4] +.Lcbc_abort: +	vldmia	sp!,{d8-d15} +	ldmia	sp!,{r4-r8,pc} +.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl	aes_v8_ctr32_encrypt_blocks +.type	aes_v8_ctr32_encrypt_blocks,%function +.align	5 +aes_v8_ctr32_encrypt_blocks: +	mov		ip,sp +	stmdb		sp!,{r4-r10,lr} +	vstmdb		sp!,{d8-d15}            @ ABI specification says so +	ldr		r4, [ip]		@ load remaining arg +	ldr		r5,[r3,#240] + +	ldr		r8, [r4, #12] +	vld1.32		{q0},[r4] + +	vld1.32		{q8-q9},[r3]		@ load key schedule... +	sub		r5,r5,#6 +	add		r7,r3,r5,lsl#4	@ pointer to last 7 round keys +	sub		r5,r5,#2 +	vld1.32		{q10-q11},[r7]! +	vld1.32		{q12-q13},[r7]! +	vld1.32		{q14-q15},[r7]! 
+	vld1.32		{q7},[r7] + +	add		r7,r3,#32 +	mov		r6,r5 + +	subs		r2,r2,#2 +	blo		.Lctr32_tail + +#ifndef __ARMEB__ +	rev		r8, r8 +#endif +	vorr		q1,q0,q0 +	add		r8, r8, #1 +	vorr		q6,q0,q0 +	rev		r10, r8 +	cmp		r5,#2 +	vmov.32	d3[1],r10 +	beq		.Lctr32_128 + +.Loop2x_ctr32: +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8 +	vld1.32		{q8},[r7]! +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	subs		r6,r6,#2 +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9 +	vld1.32		{q9},[r7]! +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	bgt		.Loop2x_ctr32 + +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8 +	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0 +	 vorr		q0,q6,q6 +	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1 +	 vorr		q1,q6,q6 +	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9 +	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9 +	 vld1.8		{q2},[r0]! +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	 vld1.8		{q3},[r0]! +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	 add		r8,r8,#1 +	.byte	0x24,0x83,0xb0,0xf3	@ aese q4,q10 +	.byte	0x24,0xa3,0xb0,0xf3	@ aese q5,q10 +	 rev		r9,r8 +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	 add		r8,r8,#1 +	.byte	0x26,0x83,0xb0,0xf3	@ aese q4,q11 +	.byte	0x26,0xa3,0xb0,0xf3	@ aese q5,q11 +	 veor		q2,q2,q7 +	 rev		r10,r8 +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	 veor		q3,q3,q7 +	 mov		r7,r3 +	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12 +	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12 +	 subs		r2,r2,#2 +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	 vld1.32	 {q8-q9},[r7]!	@ re-pre-load rndkey[0-1] +	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13 +	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13 +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14 +	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14 +	 vmov.32	d1[1], r9 +	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4 +	 vmov.32	d3[1], r10 +	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5 +	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15 +	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15 + +	 mov		r6,r5 +	veor		q2,q2,q4 +	veor		q3,q3,q5 +	vst1.8		{q2},[r1]! +	vst1.8		{q3},[r1]! +	bhs		.Loop2x_ctr32 + +	adds		r2,r2,#2 +	beq		.Lctr32_done +	b		.Lctr32_tail + +.Lctr32_128: +	vld1.32		{q4-q5},[r7] + +.Loop2x_ctr32_128: +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vld1.8		{q2},[r0]! +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	 vld1.8		{q3},[r0]! 
+	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9 +	 add		r8,r8,#1 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	 rev		r9,r8 +	.byte	0x08,0x03,0xb0,0xf3	@ aese q0,q4 +	.byte	0x08,0x23,0xb0,0xf3	@ aese q1,q4 +	 add		r8,r8,#1 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	 rev		r10,r8 +	.byte	0x0a,0x03,0xb0,0xf3	@ aese q0,q5 +	.byte	0x0a,0x23,0xb0,0xf3	@ aese q1,q5 +	 subs		r2,r2,#2 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10 +	.byte	0x24,0x23,0xb0,0xf3	@ aese q1,q10 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11 +	.byte	0x26,0x23,0xb0,0xf3	@ aese q1,q11 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12 +	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13 +	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14 +	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1 +	 veor		q2,q2,q7 +	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15 +	 veor		q3,q3,q7 +	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15 + +	veor		q2,q2,q0 +	vorr		q0,q6,q6 +	veor		q3,q3,q1 +	vorr		q1,q6,q6 +	vst1.8		{q2},[r1]! +	vmov.32	d1[1], r9 +	vst1.8		{q3},[r1]! +	vmov.32	d3[1], r10 +	bhs		.Loop2x_ctr32_128 + +	adds		r2,r2,#2 +	beq		.Lctr32_done + +.Lctr32_tail: +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	vld1.32		{q8},[r7]! +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	subs		r6,r6,#2 +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	vld1.32		{q9},[r7]! +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	bgt		.Lctr32_tail + +	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 vld1.8		{q2},[r0] +	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14 +	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0 +	 veor		q2,q2,q7 +	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15 + +	veor		q2,q2,q0 +	vst1.8		{q2},[r1] + +.Lctr32_done: +	vldmia		sp!,{d8-d15} +	ldmia		sp!,{r4-r10,pc} +.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +#endif diff --git a/main/openssl/crypto/aes/asm/aesv8-armx.pl b/main/openssl/crypto/aes/asm/aesv8-armx.pl new file mode 100644 index 00000000..415dc04a --- /dev/null +++ b/main/openssl/crypto/aes/asm/aesv8-armx.pl @@ -0,0 +1,980 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
+# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional instructions. This has +# no effect on mighty Apple A7, as results are literally equal to +# the theoretical estimates based on instruction latencies and issue +# rate. It remains to be seen how does it affect other platforms... +# +# Performance in cycles per byte processed with 128-bit key: +# +#		CBC enc		CBC dec		CTR +# Apple A7	2.39		1.20		1.20 +# Cortex-A5x	n/a		n/a		n/a + +$flavour = shift; +open STDOUT,">".shift; + +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +___ +$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/); +$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/); + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex vodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= +	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +$code.=<<___; +.align	5 +rcon: +.long	0x01,0x01,0x01,0x01 +.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat +.long	0x1b,0x1b,0x1b,0x1b + +.globl	${prefix}_set_encrypt_key +.type	${prefix}_set_encrypt_key,%function +.align	5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___	if ($flavour =~ /64/); +	stp	x29,x30,[sp,#-16]! 
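+	// minimal AArch64 frame: x29/x30 saved above, frame pointer set below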
+	add	x29,sp,#0 +___ +$code.=<<___; +	adr	$ptr,rcon +	cmp	$bits,#192 + +	veor	$zero,$zero,$zero +	vld1.8	{$in0},[$inp],#16 +	mov	$bits,#8		// reuse $bits +	vld1.32	{$rcon,$mask},[$ptr],#32 + +	b.lt	.Loop128 +	b.eq	.L192 +	b	.L256 + +.align	4 +.Loop128: +	vtbl.8	$key,{$in0},$mask +	vext.8	$tmp,$zero,$in0,#12 +	vst1.32	{$in0},[$out],#16 +	aese	$key,$zero +	subs	$bits,$bits,#1 + +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	 veor	$key,$key,$rcon +	veor	$in0,$in0,$tmp +	vshl.u8	$rcon,$rcon,#1 +	veor	$in0,$in0,$key +	b.ne	.Loop128 + +	vld1.32	{$rcon},[$ptr] + +	vtbl.8	$key,{$in0},$mask +	vext.8	$tmp,$zero,$in0,#12 +	vst1.32	{$in0},[$out],#16 +	aese	$key,$zero + +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	 veor	$key,$key,$rcon +	veor	$in0,$in0,$tmp +	vshl.u8	$rcon,$rcon,#1 +	veor	$in0,$in0,$key + +	vtbl.8	$key,{$in0},$mask +	vext.8	$tmp,$zero,$in0,#12 +	vst1.32	{$in0},[$out],#16 +	aese	$key,$zero + +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	 veor	$key,$key,$rcon +	veor	$in0,$in0,$tmp +	veor	$in0,$in0,$key +	vst1.32	{$in0},[$out] +	add	$out,$out,#0x50 + +	mov	$rounds,#10 +	b	.Ldone + +.align	4 +.L192: +	vld1.8	{$in1},[$inp],#8 +	vmov.i8	$key,#8			// borrow $key +	vst1.32	{$in0},[$out],#16 +	vsub.i8	$mask,$mask,$key	// adjust the mask + +.Loop192: +	vtbl.8	$key,{$in1},$mask +	vext.8	$tmp,$zero,$in0,#12 +	vst1.32	{$in1},[$out],#8 +	aese	$key,$zero +	subs	$bits,$bits,#1 + +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp + +	vdup.32	$tmp,${in0}[3] +	veor	$tmp,$tmp,$in1 +	 veor	$key,$key,$rcon +	vext.8	$in1,$zero,$in1,#12 +	vshl.u8	$rcon,$rcon,#1 +	veor	$in1,$in1,$tmp +	veor	$in0,$in0,$key +	veor	$in1,$in1,$key +	vst1.32	{$in0},[$out],#16 +	b.ne	.Loop192 + +	mov	$rounds,#12 +	add	$out,$out,#0x20 +	b	.Ldone + +.align	4 +.L256: +	vld1.8	{$in1},[$inp] +	mov	$bits,#7 +	mov	$rounds,#14 +	vst1.32	{$in0},[$out],#16 + +.Loop256: +	vtbl.8	$key,{$in1},$mask +	vext.8	$tmp,$zero,$in0,#12 +	vst1.32	{$in1},[$out],#16 +	aese	$key,$zero +	subs	$bits,$bits,#1 + +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in0,$in0,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	 veor	$key,$key,$rcon +	veor	$in0,$in0,$tmp +	vshl.u8	$rcon,$rcon,#1 +	veor	$in0,$in0,$key +	vst1.32	{$in0},[$out],#16 +	b.eq	.Ldone + +	vdup.32	$key,${in0}[3]		// just splat +	vext.8	$tmp,$zero,$in1,#12 +	aese	$key,$zero + +	veor	$in1,$in1,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in1,$in1,$tmp +	vext.8	$tmp,$zero,$tmp,#12 +	veor	$in1,$in1,$tmp + +	veor	$in1,$in1,$key +	b	.Loop256 + +.Ldone: +	str	$rounds,[$out] + +	eor	x0,x0,x0		// return value +	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)` +	ret +.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key + +.globl	${prefix}_set_decrypt_key +.type	${prefix}_set_decrypt_key,%function +.align	5 +${prefix}_set_decrypt_key: +___ +$code.=<<___	if ($flavour =~ /64/); +	stp	x29,x30,[sp,#-16]! 
+	add	x29,sp,#0 +___ +$code.=<<___	if ($flavour !~ /64/); +	stmdb	sp!,{r4,lr} +___ +$code.=<<___; +	bl	.Lenc_key + +	sub	$out,$out,#240		// restore original $out +	mov	x4,#-16 +	add	$inp,$out,x12,lsl#4	// end of key schedule + +	vld1.32	{v0.16b},[$out] +	vld1.32	{v1.16b},[$inp] +	vst1.32	{v0.16b},[$inp],x4 +	vst1.32	{v1.16b},[$out],#16 + +.Loop_imc: +	vld1.32	{v0.16b},[$out] +	vld1.32	{v1.16b},[$inp] +	aesimc	v0.16b,v0.16b +	aesimc	v1.16b,v1.16b +	vst1.32	{v0.16b},[$inp],x4 +	vst1.32	{v1.16b},[$out],#16 +	cmp	$inp,$out +	b.hi	.Loop_imc + +	vld1.32	{v0.16b},[$out] +	aesimc	v0.16b,v0.16b +	vst1.32	{v0.16b},[$inp] + +	eor	x0,x0,x0		// return value +___ +$code.=<<___	if ($flavour !~ /64/); +	ldmia	sp!,{r4,pc} +___ +$code.=<<___	if ($flavour =~ /64/); +	ldp	x29,x30,[sp],#16 +	ret +___ +$code.=<<___; +.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} +{{{ +sub gen_block () { +my $dir = shift; +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); +my ($inp,$out,$key)=map("x$_",(0..2)); +my $rounds="w3"; +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); + +$code.=<<___; +.globl	${prefix}_${dir}crypt +.type	${prefix}_${dir}crypt,%function +.align	5 +${prefix}_${dir}crypt: +	ldr	$rounds,[$key,#240] +	vld1.32	{$rndkey0},[$key],#16 +	vld1.8	{$inout},[$inp] +	sub	$rounds,$rounds,#2 +	vld1.32	{$rndkey1},[$key],#16 + +.Loop_${dir}c: +	aes$e	$inout,$rndkey0 +	vld1.32	{$rndkey0},[$key],#16 +	aes$mc	$inout,$inout +	subs	$rounds,$rounds,#2 +	aes$e	$inout,$rndkey1 +	vld1.32	{$rndkey1},[$key],#16 +	aes$mc	$inout,$inout +	b.gt	.Loop_${dir}c + +	aes$e	$inout,$rndkey0 +	vld1.32	{$rndkey0},[$key] +	aes$mc	$inout,$inout +	aes$e	$inout,$rndkey1 +	veor	$inout,$inout,$rndkey0 + +	vst1.8	{$inout},[$out] +	ret +.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); + +### q8-q15	preloaded key schedule + +$code.=<<___; +.globl	${prefix}_cbc_encrypt +.type	${prefix}_cbc_encrypt,%function +.align	5 +${prefix}_cbc_encrypt: +___ +$code.=<<___	if ($flavour =~ /64/); +	stp	x29,x30,[sp,#-16]! +	add	x29,sp,#0 +___ +$code.=<<___	if ($flavour !~ /64/); +	mov	ip,sp +	stmdb	sp!,{r4-r8,lr} +	vstmdb	sp!,{d8-d15}            @ ABI specification says so +	ldmia	ip,{r4-r5}		@ load remaining args +___ +$code.=<<___; +	subs	$len,$len,#16 +	mov	$step,#16 +	b.lo	.Lcbc_abort +	cclr	$step,eq + +	cmp	$enc,#0			// en- or decrypting? +	ldr	$rounds,[$key,#240] +	and	$len,$len,#-16 +	vld1.8	{$ivec},[$ivp] +	vld1.8	{$dat},[$inp],$step + +	vld1.32	{q8-q9},[$key]		// load key schedule... 
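+	// schedule holds rounds+1 round keys of 16 bytes each; the last seven
+	// stay resident in q10-q15 and q7, while q8-q9 are reloaded on the fly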
+	sub	$rounds,$rounds,#6 +	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys +	sub	$rounds,$rounds,#2 +	vld1.32	{q10-q11},[$key_],#32 +	vld1.32	{q12-q13},[$key_],#32 +	vld1.32	{q14-q15},[$key_],#32 +	vld1.32	{$rndlast},[$key_] + +	add	$key_,$key,#32 +	mov	$cnt,$rounds +	b.eq	.Lcbc_dec + +	cmp	$rounds,#2 +	veor	$dat,$dat,$ivec +	veor	$rndzero_n_last,q8,$rndlast +	b.eq	.Lcbc_enc128 + +.Loop_cbc_enc: +	aese	$dat,q8 +	vld1.32	{q8},[$key_],#16 +	aesmc	$dat,$dat +	subs	$cnt,$cnt,#2 +	aese	$dat,q9 +	vld1.32	{q9},[$key_],#16 +	aesmc	$dat,$dat +	b.gt	.Loop_cbc_enc + +	aese	$dat,q8 +	aesmc	$dat,$dat +	 subs	$len,$len,#16 +	aese	$dat,q9 +	aesmc	$dat,$dat +	 cclr	$step,eq +	aese	$dat,q10 +	aesmc	$dat,$dat +	 add	$key_,$key,#16 +	aese	$dat,q11 +	aesmc	$dat,$dat +	 vld1.8	{q8},[$inp],$step +	aese	$dat,q12 +	aesmc	$dat,$dat +	 veor	q8,q8,$rndzero_n_last +	aese	$dat,q13 +	aesmc	$dat,$dat +	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1] +	aese	$dat,q14 +	aesmc	$dat,$dat +	aese	$dat,q15 + +	 mov	$cnt,$rounds +	veor	$ivec,$dat,$rndlast +	vst1.8	{$ivec},[$out],#16 +	b.hs	.Loop_cbc_enc + +	b	.Lcbc_done + +.align	5 +.Lcbc_enc128: +	vld1.32	{$in0-$in1},[$key_] +	aese	$dat,q8 +	aesmc	$dat,$dat +	b	.Lenter_cbc_enc128 +.Loop_cbc_enc128: +	aese	$dat,q8 +	aesmc	$dat,$dat +	 vst1.8	{$ivec},[$out],#16 +.Lenter_cbc_enc128: +	aese	$dat,q9 +	aesmc	$dat,$dat +	 subs	$len,$len,#16 +	aese	$dat,$in0 +	aesmc	$dat,$dat +	 cclr	$step,eq +	aese	$dat,$in1 +	aesmc	$dat,$dat +	aese	$dat,q10 +	aesmc	$dat,$dat +	aese	$dat,q11 +	aesmc	$dat,$dat +	 vld1.8	{q8},[$inp],$step +	aese	$dat,q12 +	aesmc	$dat,$dat +	aese	$dat,q13 +	aesmc	$dat,$dat +	aese	$dat,q14 +	aesmc	$dat,$dat +	 veor	q8,q8,$rndzero_n_last +	aese	$dat,q15 +	veor	$ivec,$dat,$rndlast +	b.hs	.Loop_cbc_enc128 + +	vst1.8	{$ivec},[$out],#16 +	b	.Lcbc_done + +.align	5 +.Lcbc_dec128: +	vld1.32	{$tmp0-$tmp1},[$key_] +	veor	$ivec,$ivec,$rndlast +	veor	$in0,$dat0,$rndlast +	mov	$step1,$step + +.Loop2x_cbc_dec128: +	aesd	$dat0,q8 +	aesd	$dat1,q8 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 subs	$len,$len,#32 +	aesd	$dat0,q9 +	aesd	$dat1,q9 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 cclr	$step,lo +	aesd	$dat0,$tmp0 +	aesd	$dat1,$tmp0 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 cclr	$step1,ls +	aesd	$dat0,$tmp1 +	aesd	$dat1,$tmp1 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q10 +	aesd	$dat1,q10 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q11 +	aesd	$dat1,q11 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q12 +	aesd	$dat1,q12 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q13 +	aesd	$dat1,q13 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q14 +	aesd	$dat1,q14 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	aesd	$dat0,q15 +	aesd	$dat1,q15 + +	veor	$ivec,$ivec,$dat0 +	vld1.8	{$dat0},[$inp],$step +	veor	$in0,$in0,$dat1 +	vld1.8	{$dat1},[$inp],$step1 +	vst1.8	{$ivec},[$out],#16 +	veor	$ivec,$in1,$rndlast +	vst1.8	{$in0},[$out],#16 +	veor	$in0,$dat0,$rndlast +	vorr	$in1,$dat1,$dat1 +	b.hs	.Loop2x_cbc_dec128 + +	adds	$len,$len,#32 +	veor	$ivec,$ivec,$rndlast +	b.eq	.Lcbc_done +	veor	$in0,$in0,$rndlast +	b	.Lcbc_dec_tail + +.align	5 +.Lcbc_dec: +	subs	$len,$len,#16 +	vorr	$in0,$dat,$dat +	b.lo	.Lcbc_dec_tail + +	cclr	$step,eq +	cmp	$rounds,#2 +	vld1.8	{$dat1},[$inp],$step +	vorr	$in1,$dat1,$dat1 +	b.eq	.Lcbc_dec128 + +.Loop2x_cbc_dec: +	aesd	$dat0,q8 +	aesd	$dat1,q8 +	vld1.32	{q8},[$key_],#16 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	subs	$cnt,$cnt,#2 +	aesd	$dat0,q9 +	aesd	$dat1,q9 +	vld1.32	{q9},[$key_],#16 +	aesimc	$dat0,$dat0 +	aesimc	
$dat1,$dat1 +	b.gt	.Loop2x_cbc_dec + +	aesd	$dat0,q8 +	aesd	$dat1,q8 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 veor	$tmp0,$ivec,$rndlast +	 veor	$tmp1,$in0,$rndlast +	aesd	$dat0,q9 +	aesd	$dat1,q9 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 vorr	$ivec,$in1,$in1 +	 subs	$len,$len,#32 +	aesd	$dat0,q10 +	aesd	$dat1,q10 +	aesimc	$dat0,$dat0 +	 cclr	$step,lo +	aesimc	$dat1,$dat1 +	 mov	$key_,$key +	aesd	$dat0,q11 +	aesd	$dat1,q11 +	aesimc	$dat0,$dat0 +	 vld1.8	{$in0},[$inp],$step +	aesimc	$dat1,$dat1 +	 cclr	$step,ls +	aesd	$dat0,q12 +	aesd	$dat1,q12 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 vld1.8	{$in1},[$inp],$step +	aesd	$dat0,q13 +	aesd	$dat1,q13 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0] +	aesd	$dat0,q14 +	aesd	$dat1,q14 +	aesimc	$dat0,$dat0 +	aesimc	$dat1,$dat1 +	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1] +	aesd	$dat0,q15 +	aesd	$dat1,q15 + +	 mov	$cnt,$rounds +	veor	$tmp0,$tmp0,$dat0 +	veor	$tmp1,$tmp1,$dat1 +	 vorr	$dat0,$in0,$in0 +	vst1.8	{$tmp0},[$out],#16 +	 vorr	$dat1,$in1,$in1 +	vst1.8	{$tmp1},[$out],#16 +	b.hs	.Loop2x_cbc_dec + +	adds	$len,$len,#32 +	b.eq	.Lcbc_done + +.Lcbc_dec_tail: +	aesd	$dat,q8 +	vld1.32	{q8},[$key_],#16 +	aesimc	$dat,$dat +	subs	$cnt,$cnt,#2 +	aesd	$dat,q9 +	vld1.32	{q9},[$key_],#16 +	aesimc	$dat,$dat +	b.gt	.Lcbc_dec_tail + +	aesd	$dat,q8 +	aesimc	$dat,$dat +	aesd	$dat,q9 +	aesimc	$dat,$dat +	 veor	$tmp,$ivec,$rndlast +	aesd	$dat,q10 +	aesimc	$dat,$dat +	 vorr	$ivec,$in0,$in0 +	aesd	$dat,q11 +	aesimc	$dat,$dat +	aesd	$dat,q12 +	aesimc	$dat,$dat +	aesd	$dat,q13 +	aesimc	$dat,$dat +	aesd	$dat,q14 +	aesimc	$dat,$dat +	aesd	$dat,q15 + +	veor	$tmp,$tmp,$dat +	vst1.8	{$tmp},[$out],#16 + +.Lcbc_done: +	vst1.8	{$ivec},[$ivp] +.Lcbc_abort: +___ +$code.=<<___	if ($flavour !~ /64/); +	vldmia	sp!,{d8-d15} +	ldmia	sp!,{r4-r8,pc} +___ +$code.=<<___	if ($flavour =~ /64/); +	ldr	x29,[sp],#16 +	ret +___ +$code.=<<___; +.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15	preloaded key schedule + +$code.=<<___; +.globl	${prefix}_ctr32_encrypt_blocks +.type	${prefix}_ctr32_encrypt_blocks,%function +.align	5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___	if ($flavour =~ /64/); +	stp		x29,x30,[sp,#-16]! +	add		x29,sp,#0 +___ +$code.=<<___	if ($flavour !~ /64/); +	mov		ip,sp +	stmdb		sp!,{r4-r10,lr} +	vstmdb		sp!,{d8-d15}            @ ABI specification says so +	ldr		r4, [ip]		@ load remaining arg +___ +$code.=<<___; +	ldr		$rounds,[$key,#240] + +	ldr		$ctr, [$ivp, #12] +	vld1.32		{$dat0},[$ivp] + +	vld1.32		{q8-q9},[$key]		// load key schedule... 
+	sub		$rounds,$rounds,#6 +	add		$key_,$key,x5,lsl#4	// pointer to last 7 round keys +	sub		$rounds,$rounds,#2 +	vld1.32		{q10-q11},[$key_],#32 +	vld1.32		{q12-q13},[$key_],#32 +	vld1.32		{q14-q15},[$key_],#32 +	vld1.32		{$rndlast},[$key_] + +	add		$key_,$key,#32 +	mov		$cnt,$rounds + +	subs		$len,$len,#2 +	b.lo		.Lctr32_tail + +#ifndef __ARMEB__ +	rev		$ctr, $ctr +#endif +	vorr		$dat1,$dat0,$dat0 +	add		$ctr, $ctr, #1 +	vorr		$ivec,$dat0,$dat0 +	rev		$tctr1, $ctr +	cmp		$rounds,#2 +	vmov.32		${dat1}[3],$tctr1 +	b.eq		.Lctr32_128 + +.Loop2x_ctr32: +	aese		$dat0,q8 +	aese		$dat1,q8 +	vld1.32		{q8},[$key_],#16 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	subs		$cnt,$cnt,#2 +	aese		$dat0,q9 +	aese		$dat1,q9 +	vld1.32		{q9},[$key_],#16 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	b.gt		.Loop2x_ctr32 + +	aese		$dat0,q8 +	aese		$dat1,q8 +	aesmc		$tmp0,$dat0 +	 vorr		$dat0,$ivec,$ivec +	aesmc		$tmp1,$dat1 +	 vorr		$dat1,$ivec,$ivec +	aese		$tmp0,q9 +	aese		$tmp1,q9 +	 vld1.8		{$in0},[$inp],#16 +	aesmc		$tmp0,$tmp0 +	 vld1.8		{$in1},[$inp],#16 +	aesmc		$tmp1,$tmp1 +	 add		$ctr,$ctr,#1 +	aese		$tmp0,q10 +	aese		$tmp1,q10 +	 rev		$tctr,$ctr +	aesmc		$tmp0,$tmp0 +	aesmc		$tmp1,$tmp1 +	 add		$ctr,$ctr,#1 +	aese		$tmp0,q11 +	aese		$tmp1,q11 +	 veor		$in0,$in0,$rndlast +	 rev		$tctr1,$ctr +	aesmc		$tmp0,$tmp0 +	aesmc		$tmp1,$tmp1 +	 veor		$in1,$in1,$rndlast +	 mov		$key_,$key +	aese		$tmp0,q12 +	aese		$tmp1,q12 +	 subs		$len,$len,#2 +	aesmc		$tmp0,$tmp0 +	aesmc		$tmp1,$tmp1 +	 vld1.32	 {q8-q9},[$key_],#32	// re-pre-load rndkey[0-1] +	aese		$tmp0,q13 +	aese		$tmp1,q13 +	aesmc		$tmp0,$tmp0 +	aesmc		$tmp1,$tmp1 +	aese		$tmp0,q14 +	aese		$tmp1,q14 +	 vmov.32	${dat0}[3], $tctr +	aesmc		$tmp0,$tmp0 +	 vmov.32	${dat1}[3], $tctr1 +	aesmc		$tmp1,$tmp1 +	aese		$tmp0,q15 +	aese		$tmp1,q15 + +	 mov		$cnt,$rounds +	veor		$in0,$in0,$tmp0 +	veor		$in1,$in1,$tmp1 +	vst1.8		{$in0},[$out],#16 +	vst1.8		{$in1},[$out],#16 +	b.hs		.Loop2x_ctr32 + +	adds		$len,$len,#2 +	b.eq		.Lctr32_done +	b		.Lctr32_tail + +.Lctr32_128: +	vld1.32		{$tmp0-$tmp1},[$key_] + +.Loop2x_ctr32_128: +	aese		$dat0,q8 +	aese		$dat1,q8 +	aesmc		$dat0,$dat0 +	 vld1.8		{$in0},[$inp],#16 +	aesmc		$dat1,$dat1 +	 vld1.8		{$in1},[$inp],#16 +	aese		$dat0,q9 +	aese		$dat1,q9 +	 add		$ctr,$ctr,#1 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	 rev		$tctr,$ctr +	aese		$dat0,$tmp0 +	aese		$dat1,$tmp0 +	 add		$ctr,$ctr,#1 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	 rev		$tctr1,$ctr +	aese		$dat0,$tmp1 +	aese		$dat1,$tmp1 +	 subs		$len,$len,#2 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	aese		$dat0,q10 +	aese		$dat1,q10 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	aese		$dat0,q11 +	aese		$dat1,q11 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	aese		$dat0,q12 +	aese		$dat1,q12 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	aese		$dat0,q13 +	aese		$dat1,q13 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	aese		$dat0,q14 +	aese		$dat1,q14 +	aesmc		$dat0,$dat0 +	aesmc		$dat1,$dat1 +	 veor		$in0,$in0,$rndlast +	aese		$dat0,q15 +	 veor		$in1,$in1,$rndlast +	aese		$dat1,q15 + +	veor		$in0,$in0,$dat0 +	vorr		$dat0,$ivec,$ivec +	veor		$in1,$in1,$dat1 +	vorr		$dat1,$ivec,$ivec +	vst1.8		{$in0},[$out],#16 +	vmov.32		${dat0}[3], $tctr +	vst1.8		{$in1},[$out],#16 +	vmov.32		${dat1}[3], $tctr1 +	b.hs		.Loop2x_ctr32_128 + +	adds		$len,$len,#2 +	b.eq		.Lctr32_done + +.Lctr32_tail: +	aese		$dat,q8 +	vld1.32		{q8},[$key_],#16 +	aesmc		$dat,$dat +	subs		$cnt,$cnt,#2 +	aese		$dat,q9 +	vld1.32		{q9},[$key_],#16 +	aesmc		$dat,$dat +	b.gt		.Lctr32_tail + +	aese		$dat,q8 +	aesmc		$dat,$dat +	aese		
$dat,q9 +	aesmc		$dat,$dat +	 vld1.8		{$in0},[$inp] +	aese		$dat,q10 +	aesmc		$dat,$dat +	aese		$dat,q11 +	aesmc		$dat,$dat +	aese		$dat,q12 +	aesmc		$dat,$dat +	aese		$dat,q13 +	aesmc		$dat,$dat +	aese		$dat,q14 +	aesmc		$dat,$dat +	 veor		$in0,$in0,$rndlast +	aese		$dat,q15 + +	veor		$in0,$in0,$dat +	vst1.8		{$in0},[$out] + +.Lctr32_done: +___ +$code.=<<___	if ($flavour !~ /64/); +	vldmia		sp!,{d8-d15} +	ldmia		sp!,{r4-r10,pc} +___ +$code.=<<___	if ($flavour =~ /64/); +	ldr		x29,[sp],#16 +	ret +___ +$code.=<<___; +.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ +######################################## +if ($flavour =~ /64/) {			######## 64-bit code +    my %opcode = ( +	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800, +	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	); + +    local *unaes = sub { +	my ($mnemonic,$arg)=@_; + +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&& +	sprintf ".inst\t0x%08x\t//%s %s", +			$opcode{$mnemonic}|$1|($2<<5), +			$mnemonic,$arg; +    }; + +    foreach(split("\n",$code)) { +        s/\`([^\`]*)\`/eval($1)/geo; + +	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers +        s/@\s/\/\//o;			# old->new style commentary + +	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or +	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or +        s/vmov\.i8/movi/o	or	# fix up legacy mnemonics +        s/vext\.8/ext/o		or +        s/vrev32\.8/rev32/o	or +        s/vtst\.8/cmtst/o	or +        s/vshr/ushr/o		or +        s/^(\s+)v/$1/o		or	# strip off v prefix +	s/\bbx\s+lr\b/ret/o; + +	# fix up remainig legacy suffixes +	s/\.[ui]?8//o; +	m/\],#8/o and s/\.16b/\.8b/go; +        s/\.[ui]?32//o and s/\.16b/\.4s/go; +        s/\.[ui]?64//o and s/\.16b/\.2d/go; +	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + +        print $_,"\n"; +    } +} else {				######## 32-bit code +    my %opcode = ( +	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300, +	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	); + +    local *unaes = sub { +	my ($mnemonic,$arg)=@_; + +	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { +	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) +					 |(($2&7)<<1) |(($2&8)<<2); +	    # since ARMv7 instructions are always encoded little-endian. +	    # correct solution is to use .inst directive, but older +	    # assemblers don't implement it:-( +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", +			$word&0xff,($word>>8)&0xff, +			($word>>16)&0xff,($word>>24)&0xff, +			$mnemonic,$arg; +	} +    }; + +    sub unvtbl { +	my $arg=shift; + +	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && +	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t". 
+		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;	 +    } + +    sub unvdup32 { +	my $arg=shift; + +	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && +	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	 +    } + +    sub unvmov32 { +	my $arg=shift; + +	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && +	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;	 +    } + +    foreach(split("\n",$code)) { +        s/\`([^\`]*)\`/eval($1)/geo; + +	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers +	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers +        s/\/\/\s?/@ /o;				# new->old style commentary + +	# fix up remainig new-style suffixes +	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or +	s/\],#[0-9]+/]!/o; + +	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or +	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or +	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or +	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or +	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or +	s/^(\s+)b\./$1b/o				or +	s/^(\s+)ret/$1bx\tlr/o; + +        print $_,"\n"; +    } +} + +close STDOUT; diff --git a/main/openssl/crypto/arm64cpuid.S b/main/openssl/crypto/arm64cpuid.S new file mode 100644 index 00000000..4778ac1d --- /dev/null +++ b/main/openssl/crypto/arm64cpuid.S @@ -0,0 +1,46 @@ +#include "arm_arch.h" + +.text +.arch	armv8-a+crypto + +.align	5 +.global	_armv7_neon_probe +.type	_armv7_neon_probe,%function +_armv7_neon_probe: +	orr	v15.16b, v15.16b, v15.16b +	ret +.size	_armv7_neon_probe,.-_armv7_neon_probe + +.global	_armv7_tick +.type	_armv7_tick,%function +_armv7_tick: +	mrs	x0, CNTVCT_EL0 +	ret +.size	_armv7_tick,.-_armv7_tick + +.global	_armv8_aes_probe +.type	_armv8_aes_probe,%function +_armv8_aes_probe: +	aese	v0.16b, v0.16b +	ret +.size	_armv8_aes_probe,.-_armv8_aes_probe + +.global	_armv8_sha1_probe +.type	_armv8_sha1_probe,%function +_armv8_sha1_probe: +	sha1h	s0, s0 +	ret +.size	_armv8_sha1_probe,.-_armv8_sha1_probe + +.global	_armv8_sha256_probe +.type	_armv8_sha256_probe,%function +_armv8_sha256_probe: +	sha256su0	v0.4s, v0.4s +	ret +.size	_armv8_sha256_probe,.-_armv8_sha256_probe +.global	_armv8_pmull_probe +.type	_armv8_pmull_probe,%function +_armv8_pmull_probe: +	pmull	v0.1q, v0.1d, v0.1d +	ret +.size	_armv8_pmull_probe,.-_armv8_pmull_probe diff --git a/main/openssl/crypto/arm_arch.h b/main/openssl/crypto/arm_arch.h index 5a831076..6fa87244 100644 --- a/main/openssl/crypto/arm_arch.h +++ b/main/openssl/crypto/arm_arch.h @@ -10,13 +10,24 @@  #   define __ARMEL__  #  endif  # elif defined(__GNUC__) +#  if	defined(__aarch64__) +#   define __ARM_ARCH__ 8 +#   if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ +#    define __ARMEB__ +#   else +#    define __ARMEL__ +#   endif    /*     * Why doesn't gcc define __ARM_ARCH__? Instead it defines     * bunch of below macros. See all_architectires[] table in     * gcc/config/arm/arm.c. On a side note it defines     * __ARMEL__/__ARMEB__ for little-/big-endian.     
*/ -#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \ +#  elif defined(__ARM_ARCH) +#   define __ARM_ARCH__ __ARM_ARCH +#  elif	defined(__ARM_ARCH_8A__) +#   define __ARM_ARCH__ 8 +#  elif	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \  	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \  	defined(__ARM_ARCH_7EM__)  #   define __ARM_ARCH__ 7 @@ -43,9 +54,13 @@  #if !__ASSEMBLER__  extern unsigned int OPENSSL_armcap_P; +#endif  #define ARMV7_NEON      (1<<0)  #define ARMV7_TICK      (1<<1) -#endif +#define ARMV8_AES       (1<<2) +#define ARMV8_SHA1      (1<<3) +#define ARMV8_SHA256    (1<<4) +#define ARMV8_PMULL     (1<<5)  #endif diff --git a/main/openssl/crypto/armcap.c b/main/openssl/crypto/armcap.c index 9abaf396..7e46d07a 100644 --- a/main/openssl/crypto/armcap.c +++ b/main/openssl/crypto/armcap.c @@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }   * ARM compilers support inline assembler...   */  void _armv7_neon_probe(void); -unsigned int _armv7_tick(void); +void _armv8_aes_probe(void); +void _armv8_sha1_probe(void); +void _armv8_sha256_probe(void); +void _armv8_pmull_probe(void); +unsigned long _armv7_tick(void); -unsigned int OPENSSL_rdtsc(void) +unsigned long OPENSSL_rdtsc(void)  	{  	if (OPENSSL_armcap_P & ARMV7_TICK)  		return _armv7_tick(); @@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void)  		return 0;  	} +/* + * Use a weak reference to getauxval() so we can use it if it is available but + * don't break the build if it is not. + */  #if defined(__GNUC__) && __GNUC__>=2  void OPENSSL_cpuid_setup(void) __attribute__((constructor)); +extern unsigned long getauxval(unsigned long type) __attribute__((weak)); +#else +static unsigned long (*getauxval)(unsigned long) = NULL;  #endif + +/* + * ARM puts the the feature bits for Crypto Extensions in AT_HWCAP2, whereas + * AArch64 used AT_HWCAP. 
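+ * The constants below mirror the kernel's numeric values rather than relying
+ * on <sys/auxv.h> or <asm/hwcap.h>, presumably so the file still builds with
+ * older C libraries; on 32-bit ARM the runtime check further down amounts to
+ * something like getauxval(AT_HWCAP2) & HWCAP2_AES.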
+ */ +#if defined(__arm__) || defined (__arm) +# define HWCAP			16	/* AT_HWCAP */ +# define HWCAP_NEON		(1 << 12) + +# define HWCAP_CE		26	/* AT_HWCAP2 */ +# define HWCAP_CE_AES		(1 << 0) +# define HWCAP_CE_PMULL		(1 << 1) +# define HWCAP_CE_SHA1		(1 << 2) +# define HWCAP_CE_SHA256	(1 << 3) +#elif defined(__aarch64__) +# define HWCAP			16	/* AT_HWCAP */ +# define HWCAP_NEON		(1 << 1) + +# define HWCAP_CE		HWCAP +# define HWCAP_CE_AES		(1 << 3) +# define HWCAP_CE_PMULL		(1 << 4) +# define HWCAP_CE_SHA1		(1 << 5) +# define HWCAP_CE_SHA256	(1 << 6) +#endif +  void OPENSSL_cpuid_setup(void)  	{  	char *e; @@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void)  	if ((e=getenv("OPENSSL_armcap")))  		{ -		OPENSSL_armcap_P=strtoul(e,NULL,0); +		OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0);  		return;  		} @@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void)  	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);  	sigaction(SIGILL,&ill_act,&ill_oact); -	if (sigsetjmp(ill_jmp,1) == 0) +	if (getauxval != NULL) +		{ +		if (getauxval(HWCAP) & HWCAP_NEON) +			{ +			unsigned long hwcap = getauxval(HWCAP_CE); + +			OPENSSL_armcap_P |= ARMV7_NEON; + +			if (hwcap & HWCAP_CE_AES) +				OPENSSL_armcap_P |= ARMV8_AES; + +			if (hwcap & HWCAP_CE_PMULL) +				OPENSSL_armcap_P |= ARMV8_PMULL; + +			if (hwcap & HWCAP_CE_SHA1) +				OPENSSL_armcap_P |= ARMV8_SHA1; + +			if (hwcap & HWCAP_CE_SHA256) +				OPENSSL_armcap_P |= ARMV8_SHA256; +			} +		} +	else if (sigsetjmp(ill_jmp,1) == 0)  		{  		_armv7_neon_probe();  		OPENSSL_armcap_P |= ARMV7_NEON; +		if (sigsetjmp(ill_jmp,1) == 0) +			{ +			_armv8_pmull_probe(); +			OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES; +			} +		else if (sigsetjmp(ill_jmp,1) == 0) +			{ +			_armv8_aes_probe(); +			OPENSSL_armcap_P |= ARMV8_AES; +			} +		if (sigsetjmp(ill_jmp,1) == 0) +			{ +			_armv8_sha1_probe(); +			OPENSSL_armcap_P |= ARMV8_SHA1; +			} +		if (sigsetjmp(ill_jmp,1) == 0) +			{ +			_armv8_sha256_probe(); +			OPENSSL_armcap_P |= ARMV8_SHA256; +			}  		}  	if (sigsetjmp(ill_jmp,1) == 0)  		{ diff --git a/main/openssl/crypto/armv4cpuid.S b/main/openssl/crypto/armv4cpuid.S index 2d618dea..add11d40 100644 --- a/main/openssl/crypto/armv4cpuid.S +++ b/main/openssl/crypto/armv4cpuid.S @@ -7,17 +7,49 @@  .global	_armv7_neon_probe  .type	_armv7_neon_probe,%function  _armv7_neon_probe: -	.word	0xf26ee1fe	@ vorr	q15,q15,q15 -	.word	0xe12fff1e	@ bx	lr +	.byte	0xf0,0x01,0x60,0xf2	@ vorr	q8,q8,q8 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr  .size	_armv7_neon_probe,.-_armv7_neon_probe  .global	_armv7_tick  .type	_armv7_tick,%function  _armv7_tick: -	mrc	p15,0,r0,c9,c13,0 -	.word	0xe12fff1e	@ bx	lr +	mrrc	p15,1,r0,r1,c14		@ CNTVCT +#if __ARM_ARCH__>=5 +	bx	lr +#else +	.word	0xe12fff1e		@ bx	lr +#endif  .size	_armv7_tick,.-_armv7_tick +.global	_armv8_aes_probe +.type	_armv8_aes_probe,%function +_armv8_aes_probe: +	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr +.size	_armv8_aes_probe,.-_armv8_aes_probe + +.global	_armv8_sha1_probe +.type	_armv8_sha1_probe,%function +_armv8_sha1_probe: +	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr +.size	_armv8_sha1_probe,.-_armv8_sha1_probe + +.global	_armv8_sha256_probe +.type	_armv8_sha256_probe,%function +_armv8_sha256_probe: +	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx lr +.size	_armv8_sha256_probe,.-_armv8_sha256_probe +.global	_armv8_pmull_probe +.type	_armv8_pmull_probe,%function +_armv8_pmull_probe: +	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0 +	.byte	
0x1e,0xff,0x2f,0xe1	@ bx	lr +.size	_armv8_pmull_probe,.-_armv8_pmull_probe + +.align	5  .global	OPENSSL_atomic_add  .type	OPENSSL_atomic_add,%function  OPENSSL_atomic_add: @@ -28,7 +60,7 @@ OPENSSL_atomic_add:  	cmp	r2,#0  	bne	.Ladd  	mov	r0,r3 -	.word	0xe12fff1e	@ bx	lr +	bx	lr  #else  	stmdb	sp!,{r4-r6,lr}  	ldr	r2,.Lspinlock @@ -81,9 +113,13 @@ OPENSSL_cleanse:  	adds	r1,r1,#4  	bne	.Little  .Lcleanse_done: +#if __ARM_ARCH__>=5 +	bx	lr +#else  	tst	lr,#1  	moveq	pc,lr  	.word	0xe12fff1e	@ bx	lr +#endif  .size	OPENSSL_cleanse,.-OPENSSL_cleanse  .global	OPENSSL_wipe_cpu @@ -97,41 +133,53 @@ OPENSSL_wipe_cpu:  	eor	ip,ip,ip  	tst	r0,#1  	beq	.Lwipe_done -	.word	0xf3000150	@ veor    q0, q0, q0 -	.word	0xf3022152	@ veor    q1, q1, q1 -	.word	0xf3044154	@ veor    q2, q2, q2 -	.word	0xf3066156	@ veor    q3, q3, q3 -	.word	0xf34001f0	@ veor    q8, q8, q8 -	.word	0xf34221f2	@ veor    q9, q9, q9 -	.word	0xf34441f4	@ veor    q10, q10, q10 -	.word	0xf34661f6	@ veor    q11, q11, q11 -	.word	0xf34881f8	@ veor    q12, q12, q12 -	.word	0xf34aa1fa	@ veor    q13, q13, q13 -	.word	0xf34cc1fc	@ veor    q14, q14, q14 -	.word	0xf34ee1fe	@ veor    q15, q15, q15 +	.byte	0x50,0x01,0x00,0xf3	@ veor	q0, q0, q0 +	.byte	0x52,0x21,0x02,0xf3	@ veor	q1, q1, q1 +	.byte	0x54,0x41,0x04,0xf3	@ veor	q2, q2, q2 +	.byte	0x56,0x61,0x06,0xf3	@ veor	q3, q3, q3 +	.byte	0xf0,0x01,0x40,0xf3	@ veor	q8, q8, q8 +	.byte	0xf2,0x21,0x42,0xf3	@ veor	q9, q9, q9 +	.byte	0xf4,0x41,0x44,0xf3	@ veor	q10, q10, q10 +	.byte	0xf6,0x61,0x46,0xf3	@ veor	q11, q11, q11 +	.byte	0xf8,0x81,0x48,0xf3	@ veor	q12, q12, q12 +	.byte	0xfa,0xa1,0x4a,0xf3	@ veor	q13, q13, q13 +	.byte	0xfc,0xc1,0x4c,0xf3	@ veor	q14, q14, q14 +	.byte	0xfe,0xe1,0x4e,0xf3	@ veor	q14, q14, q14  .Lwipe_done:  	mov	r0,sp +#if __ARM_ARCH__>=5 +	bx	lr +#else  	tst	lr,#1  	moveq	pc,lr  	.word	0xe12fff1e	@ bx	lr +#endif  .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu  .global	OPENSSL_instrument_bus  .type	OPENSSL_instrument_bus,%function  OPENSSL_instrument_bus:  	eor	r0,r0,r0 +#if __ARM_ARCH__>=5 +	bx	lr +#else  	tst	lr,#1  	moveq	pc,lr  	.word	0xe12fff1e	@ bx	lr +#endif  .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus  .global	OPENSSL_instrument_bus2  .type	OPENSSL_instrument_bus2,%function  OPENSSL_instrument_bus2:  	eor	r0,r0,r0 +#if __ARM_ARCH__>=5 +	bx	lr +#else  	tst	lr,#1  	moveq	pc,lr  	.word	0xe12fff1e	@ bx	lr +#endif  .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2  .align	5 diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.S b/main/openssl/crypto/bn/asm/armv4-gf2m.S index 038f0864..0fa25b26 100644 --- a/main/openssl/crypto/bn/asm/armv4-gf2m.S +++ b/main/openssl/crypto/bn/asm/armv4-gf2m.S @@ -5,31 +5,6 @@  #if __ARM_ARCH__>=7  .fpu	neon - -.type	mul_1x1_neon,%function -.align	5 -mul_1x1_neon: -	vshl.u64	d2,d16,#8	@ q1-q3 are slided  -	vmull.p8	q0,d16,d17	@ a·bb -	vshl.u64	d4,d16,#16 -	vmull.p8	q1,d2,d17	@ a<<8·bb -	vshl.u64	d6,d16,#24 -	vmull.p8	q2,d4,d17	@ a<<16·bb -	vshr.u64	d2,#8 -	vmull.p8	q3,d6,d17	@ a<<24·bb -	vshl.u64	d3,#24 -	veor		d0,d2 -	vshr.u64	d4,#16 -	veor		d0,d3 -	vshl.u64	d5,#16 -	veor		d0,d4 -	vshr.u64	d6,#24 -	veor		d0,d5 -	vshl.u64	d7,#8 -	veor		d0,d6 -	veor		d0,d7 -	.word	0xe12fff1e -.size	mul_1x1_neon,.-mul_1x1_neon  #endif  .type	mul_1x1_ialu,%function  .align	5 @@ -120,40 +95,53 @@ bn_GF2m_mul_2x2:  	tst	r12,#1  	beq	.Lialu -	veor	d18,d18 -	vmov.32	d19,r3,r3		@ two copies of b1 -	vmov.32	d18[0],r1		@ a1 - -	veor	d20,d20 -	vld1.32	d21[],[sp,:32]	@ two copies of b0 -	vmov.32	d20[0],r2		@ a0 -	mov	r12,lr - -	vmov	d16,d18 -	vmov	d17,d19 -	bl	
mul_1x1_neon		@ a1·b1 -	vmov	d22,d0 - -	vmov	d16,d20 -	vmov	d17,d21 -	bl	mul_1x1_neon		@ a0·b0 -	vmov	d23,d0 - -	veor	d16,d20,d18 -	veor	d17,d21,d19 -	veor	d20,d23,d22 -	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1) - -	veor	d0,d20			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1 -	vshl.u64 d1,d0,#32 -	vshr.u64 d0,d0,#32 -	veor	d23,d1 -	veor	d22,d0 -	vst1.32	{d23[0]},[r0,:32]! -	vst1.32	{d23[1]},[r0,:32]! -	vst1.32	{d22[0]},[r0,:32]! -	vst1.32	{d22[1]},[r0,:32] -	bx	r12 +	ldr		r12, [sp]		@ 5th argument +	vmov.32		d26, r2, r1 +	vmov.32		d27, r12, r3 +	vmov.i64	d28, #0x0000ffffffffffff +	vmov.i64	d29, #0x00000000ffffffff +	vmov.i64	d30, #0x000000000000ffff + +	vext.8		d2, d26, d26, #1	@ A1 +	vmull.p8	q1, d2, d27		@ F = A1*B +	vext.8		d0, d27, d27, #1	@ B1 +	vmull.p8	q0, d26, d0		@ E = A*B1 +	vext.8		d4, d26, d26, #2	@ A2 +	vmull.p8	q2, d4, d27		@ H = A2*B +	vext.8		d16, d27, d27, #2	@ B2 +	vmull.p8	q8, d26, d16		@ G = A*B2 +	vext.8		d6, d26, d26, #3	@ A3 +	veor		q1, q1, q0		@ L = E + F +	vmull.p8	q3, d6, d27		@ J = A3*B +	vext.8		d0, d27, d27, #3	@ B3 +	veor		q2, q2, q8		@ M = G + H +	vmull.p8	q0, d26, d0		@ I = A*B3 +	veor		d2, d2, d3	@ t0 = (L) (P0 + P1) << 8 +	vand		d3, d3, d28 +	vext.8		d16, d27, d27, #4	@ B4 +	veor		d4, d4, d5	@ t1 = (M) (P2 + P3) << 16 +	vand		d5, d5, d29 +	vmull.p8	q8, d26, d16		@ K = A*B4 +	veor		q3, q3, q0		@ N = I + J +	veor		d2, d2, d3 +	veor		d4, d4, d5 +	veor		d6, d6, d7	@ t2 = (N) (P4 + P5) << 24 +	vand		d7, d7, d30 +	vext.8		q1, q1, q1, #15 +	veor		d16, d16, d17	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	d17, #0 +	vext.8		q2, q2, q2, #14 +	veor		d6, d6, d7 +	vmull.p8	q0, d26, d27		@ D = A*B +	vext.8		q8, q8, q8, #12 +	vext.8		q3, q3, q3, #13 +	veor		q1, q1, q2 +	veor		q3, q3, q8 +	veor		q0, q0, q1 +	veor		q0, q0, q3 + +	vst1.32		{q0}, [r0] +	bx	lr		@ bx lr  .align	4  .Lialu:  #endif diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.pl b/main/openssl/crypto/bn/asm/armv4-gf2m.pl index 22ad1f85..3f1f4f67 100644 --- a/main/openssl/crypto/bn/asm/armv4-gf2m.pl +++ b/main/openssl/crypto/bn/asm/armv4-gf2m.pl @@ -20,14 +20,21 @@  # length, more for longer keys. Even though NEON 1x1 multiplication  # runs in even less cycles, ~30, improvement is measurable only on  # longer keys. One has to optimize code elsewhere to get NEON glow... +# +# April 2014 +# +# Double bn_GF2m_mul_2x2 performance by using algorithm from paper +# referred below, which improves ECDH and ECDSA verify benchmarks +# by 18-40%. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. 
+#  +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}  open STDOUT,">$output"; -sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     } -sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   } -sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } -  $code=<<___;  #include "arm_arch.h" @@ -36,31 +43,6 @@ $code=<<___;  #if __ARM_ARCH__>=7  .fpu	neon - -.type	mul_1x1_neon,%function -.align	5 -mul_1x1_neon: -	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a -	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb -	vshl.u64	`&Dlo("q2")`,d16,#16 -	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb -	vshl.u64	`&Dlo("q3")`,d16,#24 -	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb -	vshr.u64	`&Dlo("q1")`,#8 -	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb -	vshl.u64	`&Dhi("q1")`,#24 -	veor		d0,`&Dlo("q1")` -	vshr.u64	`&Dlo("q2")`,#16 -	veor		d0,`&Dhi("q1")` -	vshl.u64	`&Dhi("q2")`,#16 -	veor		d0,`&Dlo("q2")` -	vshr.u64	`&Dlo("q3")`,#24 -	veor		d0,`&Dhi("q2")` -	vshl.u64	`&Dhi("q3")`,#8 -	veor		d0,`&Dlo("q3")` -	veor		d0,`&Dhi("q3")` -	bx	lr -.size	mul_1x1_neon,.-mul_1x1_neon  #endif  ___  ################ @@ -159,8 +141,9 @@ ___  # void	bn_GF2m_mul_2x2(BN_ULONG *r,  #	BN_ULONG a1,BN_ULONG a0,  #	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0 - -($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); +{ +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));  $code.=<<___;  .global	bn_GF2m_mul_2x2 @@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:  	tst	r12,#1  	beq	.Lialu -	veor	$A1,$A1 -	vmov.32	$B1,r3,r3		@ two copies of b1 -	vmov.32	${A1}[0],r1		@ a1 - -	veor	$A0,$A0 -	vld1.32	${B0}[],[sp,:32]	@ two copies of b0 -	vmov.32	${A0}[0],r2		@ a0 -	mov	r12,lr - -	vmov	d16,$A1 -	vmov	d17,$B1 -	bl	mul_1x1_neon		@ a1·b1 -	vmov	$A1B1,d0 - -	vmov	d16,$A0 -	vmov	d17,$B0 -	bl	mul_1x1_neon		@ a0·b0 -	vmov	$A0B0,d0 - -	veor	d16,$A0,$A1 -	veor	d17,$B0,$B1 -	veor	$A0,$A0B0,$A1B1 -	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1) - -	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1 -	vshl.u64 d1,d0,#32 -	vshr.u64 d0,d0,#32 -	veor	$A0B0,d1 -	veor	$A1B1,d0 -	vst1.32	{${A0B0}[0]},[r0,:32]! -	vst1.32	{${A0B0}[1]},[r0,:32]! -	vst1.32	{${A1B1}[0]},[r0,:32]! 
-	vst1.32	{${A1B1}[1]},[r0,:32] -	bx	r12 +	ldr		r12, [sp]		@ 5th argument +	vmov.32		$a, r2, r1 +	vmov.32		$b, r12, r3 +	vmov.i64	$k48, #0x0000ffffffffffff +	vmov.i64	$k32, #0x00000000ffffffff +	vmov.i64	$k16, #0x000000000000ffff + +	vext.8		$t0#lo, $a, $a, #1	@ A1 +	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B +	vext.8		$r#lo, $b, $b, #1	@ B1 +	vmull.p8	$r, $a, $r#lo		@ E = A*B1 +	vext.8		$t1#lo, $a, $a, #2	@ A2 +	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B +	vext.8		$t3#lo, $b, $b, #2	@ B2 +	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2 +	vext.8		$t2#lo, $a, $a, #3	@ A3 +	veor		$t0, $t0, $r		@ L = E + F +	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B +	vext.8		$r#lo, $b, $b, #3	@ B3 +	veor		$t1, $t1, $t3		@ M = G + H +	vmull.p8	$r, $a, $r#lo		@ I = A*B3 +	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8 +	vand		$t0#hi, $t0#hi, $k48 +	vext.8		$t3#lo, $b, $b, #4	@ B4 +	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16 +	vand		$t1#hi, $t1#hi, $k32 +	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4 +	veor		$t2, $t2, $r		@ N = I + J +	veor		$t0#lo, $t0#lo, $t0#hi +	veor		$t1#lo, $t1#lo, $t1#hi +	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24 +	vand		$t2#hi, $t2#hi, $k16 +	vext.8		$t0, $t0, $t0, #15 +	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	$t3#hi, #0 +	vext.8		$t1, $t1, $t1, #14 +	veor		$t2#lo, $t2#lo, $t2#hi +	vmull.p8	$r, $a, $b		@ D = A*B +	vext.8		$t3, $t3, $t3, #12 +	vext.8		$t2, $t2, $t2, #13 +	veor		$t0, $t0, $t1 +	veor		$t2, $t2, $t3 +	veor		$r, $r, $t0 +	veor		$r, $r, $t2 + +	vst1.32		{$r}, [r0] +	ret		@ bx lr  .align	4  .Lialu:  #endif  ___ +}  $ret="r10";	# reassigned 1st argument  $code.=<<___;  	stmdb	sp!,{r4-r10,lr} @@ -272,7 +269,13 @@ $code.=<<___;  .comm	OPENSSL_armcap_P,4,4  ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { +	s/\`([^\`]*)\`/eval $1/geo; + +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or +	s/\bret\b/bx	lr/go		or +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4 + +	print $_,"\n"; +}  close STDOUT;   # enforce flush diff --git a/main/openssl/crypto/bn/asm/armv4-mont.pl b/main/openssl/crypto/bn/asm/armv4-mont.pl index f78a8b5f..72bad8e3 100644 --- a/main/openssl/crypto/bn/asm/armv4-mont.pl +++ b/main/openssl/crypto/bn/asm/armv4-mont.pl @@ -1,7 +1,7 @@  #!/usr/bin/env perl  # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL  # project. The module is, however, dual licensed under OpenSSL and  # CRYPTOGAMS licenses depending on where you obtain it. For further  # details see http://www.openssl.org/~appro/cryptogams/. @@ -23,6 +23,21 @@  # than 1/2KB. Windows CE port would be trivial, as it's exclusively  # about decorations, ABI and instruction syntax are identical. +# November 2013 +# +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA +# performance improvement on Cortex-A8 is ~45-100% depending on key +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. +# On Snapdragon S4 improvement was measured to vary from ~70% to +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is +# rather because original integer-only code seems to perform +# suboptimally on S4. Situation on Cortex-A9 is unfortunately +# different. 
It's being looked into, but the trouble is that +# performance for vectors longer than 256 bits is actually couple +# of percent worse than for integer-only code. The code is chosen +# for execution on all NEON-capable processors, because gain on +# others outweighs the marginal loss on Cortex-A9. +  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}  open STDOUT,">$output"; @@ -52,16 +67,40 @@ $_n0="$num,#14*4";  $_num="$num,#15*4";	$_bpend=$_num;  $code=<<___; +#include "arm_arch.h" +  .text +.code	32 + +#if __ARM_ARCH__>=7 +.align	5 +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-bn_mul_mont +#endif  .global	bn_mul_mont  .type	bn_mul_mont,%function -.align	2 +.align	5  bn_mul_mont: +	ldr	ip,[sp,#4]		@ load num  	stmdb	sp!,{r0,r2}		@ sp points at argument block -	ldr	$num,[sp,#3*4]		@ load num -	cmp	$num,#2 +#if __ARM_ARCH__>=7 +	tst	ip,#7 +	bne	.Lialu +	adr	r0,bn_mul_mont +	ldr	r2,.LOPENSSL_armcap +	ldr	r0,[r0,r2] +	tst	r0,#1			@ NEON available? +	ldmia	sp, {r0,r2} +	beq	.Lialu +	add	sp,sp,#8 +	b	bn_mul8x_mont_neon +.align	4 +.Lialu: +#endif +	cmp	ip,#2 +	mov	$num,ip			@ load num  	movlt	r0,#0  	addlt	sp,sp,#2*4  	blt	.Labrt @@ -191,14 +230,446 @@ bn_mul_mont:  	ldmia	sp!,{r4-r12,lr}		@ restore registers  	add	sp,sp,#2*4		@ skip over {r0,r2}  	mov	r0,#1 -.Labrt:	tst	lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 +	ret				@ bx lr +#else +	tst	lr,#1  	moveq	pc,lr			@ be binary compatible with V4, yet  	bx	lr			@ interoperable with Thumb ISA:-) +#endif  .size	bn_mul_mont,.-bn_mul_mont -.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ +{ +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     } +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   } + +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); +my ($Z,$Temp)=("q4","q5"); +my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); +my $zero=&Dlo($Z); +my $temp=&Dlo($Temp); + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	bn_mul8x_mont_neon,%function +.align	5 +bn_mul8x_mont_neon: +	mov	ip,sp +	stmdb	sp!,{r4-r11} +	vstmdb	sp!,{d8-d15}		@ ABI specification says so +	ldmia	ip,{r4-r5}		@ load rest of parameter block + +	sub		$toutptr,sp,#16 +	vld1.32		{${Bi}[0]}, [$bptr,:32]! +	sub		$toutptr,$toutptr,$num,lsl#4 +	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-( +	and		$toutptr,$toutptr,#-64 +	vld1.32		{${M0}[0]}, [$n0,:32] +	mov		sp,$toutptr			@ alloca +	veor		$zero,$zero,$zero +	subs		$inner,$num,#8 +	vzip.16		$Bi,$zero + +	vmull.u32	$A0xB,$Bi,${A0}[0] +	vmull.u32	$A1xB,$Bi,${A0}[1] +	vmull.u32	$A2xB,$Bi,${A1}[0] +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16 +	vmull.u32	$A3xB,$Bi,${A1}[1] + +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")` +	veor		$zero,$zero,$zero +	vmul.u32	$Ni,$temp,$M0 + +	vmull.u32	$A4xB,$Bi,${A2}[0] +	 vld1.32	{$N0-$N3}, [$nptr]! +	vmull.u32	$A5xB,$Bi,${A2}[1] +	vmull.u32	$A6xB,$Bi,${A3}[0] +	vzip.16		$Ni,$zero +	vmull.u32	$A7xB,$Bi,${A3}[1] + +	bne	.LNEON_1st + +	@ special case for num=8, everything is in register bank... 
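+	@ (all eight column accumulators $A0xB-$A7xB fit in registers, so the
+	@ .LNEON_1st pass and its stores to the stack scratch area are skipped)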
+ +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	sub		$outer,$num,#1 +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	vmlal.u32	$A3xB,$Ni,${N1}[1] + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	vmov		$Temp,$A0xB +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	vmov		$A0xB,$A1xB +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	vmov		$A1xB,$A2xB +	vmlal.u32	$A7xB,$Ni,${N3}[1] +	vmov		$A2xB,$A3xB +	vmov		$A3xB,$A4xB +	vshr.u64	$temp,$temp,#16 +	vmov		$A4xB,$A5xB +	vmov		$A5xB,$A6xB +	vadd.u64	$temp,$temp,`&Dhi("$Temp")` +	vmov		$A6xB,$A7xB +	veor		$A7xB,$A7xB +	vshr.u64	$temp,$temp,#16 + +	b	.LNEON_outer8 + +.align	4 +.LNEON_outer8: +	vld1.32		{${Bi}[0]}, [$bptr,:32]! +	veor		$zero,$zero,$zero +	vzip.16		$Bi,$zero +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + +	vmlal.u32	$A0xB,$Bi,${A0}[0] +	vmlal.u32	$A1xB,$Bi,${A0}[1] +	vmlal.u32	$A2xB,$Bi,${A1}[0] +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16 +	vmlal.u32	$A3xB,$Bi,${A1}[1] + +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")` +	veor		$zero,$zero,$zero +	subs		$outer,$outer,#1 +	vmul.u32	$Ni,$temp,$M0 + +	vmlal.u32	$A4xB,$Bi,${A2}[0] +	vmlal.u32	$A5xB,$Bi,${A2}[1] +	vmlal.u32	$A6xB,$Bi,${A3}[0] +	vzip.16		$Ni,$zero +	vmlal.u32	$A7xB,$Bi,${A3}[1] + +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	vmlal.u32	$A3xB,$Ni,${N1}[1] + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	vmov		$Temp,$A0xB +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	vmov		$A0xB,$A1xB +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	vmov		$A1xB,$A2xB +	vmlal.u32	$A7xB,$Ni,${N3}[1] +	vmov		$A2xB,$A3xB +	vmov		$A3xB,$A4xB +	vshr.u64	$temp,$temp,#16 +	vmov		$A4xB,$A5xB +	vmov		$A5xB,$A6xB +	vadd.u64	$temp,$temp,`&Dhi("$Temp")` +	vmov		$A6xB,$A7xB +	veor		$A7xB,$A7xB +	vshr.u64	$temp,$temp,#16 + +	bne	.LNEON_outer8 + +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp +	mov		$toutptr,sp +	vshr.u64	$temp,`&Dlo("$A0xB")`,#16 +	mov		$inner,$num +	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp +	add		$tinptr,sp,#16 +	vshr.u64	$temp,`&Dhi("$A0xB")`,#16 +	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")` + +	b	.LNEON_tail2 + +.align	4 +.LNEON_1st: +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	 vld1.32	{$A0-$A3}, [$aptr]! +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	subs		$inner,$inner,#8 +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	vmlal.u32	$A3xB,$Ni,${N1}[1] + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	 vld1.32	{$N0-$N1}, [$nptr]! +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]! +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	vmlal.u32	$A7xB,$Ni,${N3}[1] +	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]! + +	vmull.u32	$A0xB,$Bi,${A0}[0] +	 vld1.32	{$N2-$N3}, [$nptr]! +	vmull.u32	$A1xB,$Bi,${A0}[1] +	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]! +	vmull.u32	$A2xB,$Bi,${A1}[0] +	vmull.u32	$A3xB,$Bi,${A1}[1] +	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]! + +	vmull.u32	$A4xB,$Bi,${A2}[0] +	vmull.u32	$A5xB,$Bi,${A2}[1] +	vmull.u32	$A6xB,$Bi,${A3}[0] +	vmull.u32	$A7xB,$Bi,${A3}[1] + +	bne	.LNEON_1st + +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	add		$tinptr,sp,#16 +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	 vld1.64	{$Temp}, [sp,:128] +	vmlal.u32	$A3xB,$Ni,${N1}[1] +	sub		$outer,$num,#1 + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]! +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	vshr.u64	$temp,$temp,#16 +	 vld1.64	{$A0xB},       [$tinptr, :128]! +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]! +	vmlal.u32	$A7xB,$Ni,${N3}[1] + +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]! +	vadd.u64	$temp,$temp,`&Dhi("$Temp")` +	veor		$Z,$Z,$Z +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]! +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]! 
+	vst1.64		{$Z},          [$toutptr,:128] +	vshr.u64	$temp,$temp,#16 + +	b		.LNEON_outer + +.align	4 +.LNEON_outer: +	vld1.32		{${Bi}[0]}, [$bptr,:32]! +	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr +	vld1.32		{$A0-$A3},  [$aptr]! +	veor		$zero,$zero,$zero +	mov		$toutptr,sp +	vzip.16		$Bi,$zero +	sub		$inner,$num,#8 +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + +	vmlal.u32	$A0xB,$Bi,${A0}[0] +	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]! +	vmlal.u32	$A1xB,$Bi,${A0}[1] +	vmlal.u32	$A2xB,$Bi,${A1}[0] +	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]! +	vmlal.u32	$A3xB,$Bi,${A1}[1] + +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16 +	veor		$zero,$zero,$zero +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")` +	 vld1.64	{$A7xB},[$tinptr,:128]! +	vmul.u32	$Ni,$temp,$M0 + +	vmlal.u32	$A4xB,$Bi,${A2}[0] +	 vld1.32	{$N0-$N3}, [$nptr]! +	vmlal.u32	$A5xB,$Bi,${A2}[1] +	vmlal.u32	$A6xB,$Bi,${A3}[0] +	vzip.16		$Ni,$zero +	vmlal.u32	$A7xB,$Bi,${A3}[1] + +.LNEON_inner: +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	 vld1.32	{$A0-$A3}, [$aptr]! +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	 subs		$inner,$inner,#8 +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	vmlal.u32	$A3xB,$Ni,${N1}[1] +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]! + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	 vld1.64	{$A0xB},       [$tinptr, :128]! +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]! +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]! +	vmlal.u32	$A7xB,$Ni,${N3}[1] +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]! + +	vmlal.u32	$A0xB,$Bi,${A0}[0] +	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]! +	vmlal.u32	$A1xB,$Bi,${A0}[1] +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]! +	vmlal.u32	$A2xB,$Bi,${A1}[0] +	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]! +	vmlal.u32	$A3xB,$Bi,${A1}[1] +	 vld1.32	{$N0-$N3}, [$nptr]! + +	vmlal.u32	$A4xB,$Bi,${A2}[0] +	 vld1.64	{$A7xB},       [$tinptr, :128]! +	vmlal.u32	$A5xB,$Bi,${A2}[1] +	vmlal.u32	$A6xB,$Bi,${A3}[0] +	vmlal.u32	$A7xB,$Bi,${A3}[1] + +	bne	.LNEON_inner + +	vmlal.u32	$A0xB,$Ni,${N0}[0] +	add		$tinptr,sp,#16 +	vmlal.u32	$A1xB,$Ni,${N0}[1] +	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr +	vmlal.u32	$A2xB,$Ni,${N1}[0] +	 vld1.64	{$Temp}, [sp,:128] +	vmlal.u32	$A3xB,$Ni,${N1}[1] +	subs		$outer,$outer,#1 + +	vmlal.u32	$A4xB,$Ni,${N2}[0] +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]! +	vmlal.u32	$A5xB,$Ni,${N2}[1] +	 vld1.64	{$A0xB},       [$tinptr, :128]! +	vshr.u64	$temp,$temp,#16 +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]! +	vmlal.u32	$A6xB,$Ni,${N3}[0] +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]! +	vmlal.u32	$A7xB,$Ni,${N3}[1] + +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]! +	vadd.u64	$temp,$temp,`&Dhi("$Temp")` +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]! +	vshr.u64	$temp,$temp,#16 + +	bne	.LNEON_outer + +	mov		$toutptr,sp +	mov		$inner,$num + +.LNEON_tail: +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp +	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]! +	vshr.u64	$temp,`&Dlo("$A0xB")`,#16 +	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp +	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]! +	vshr.u64	$temp,`&Dhi("$A0xB")`,#16 +	vld1.64		{$A7xB},       [$tinptr, :128]! +	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")` + +.LNEON_tail2: +	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp +	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A1xB")`,#16 +	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp +	vshr.u64	$temp,`&Dhi("$A1xB")`,#16 +	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")` + +	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp +	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! 
+	vshr.u64	$temp,`&Dlo("$A2xB")`,#16 +	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp +	vshr.u64	$temp,`&Dhi("$A2xB")`,#16 +	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")` + +	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp +	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A3xB")`,#16 +	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp +	vshr.u64	$temp,`&Dhi("$A3xB")`,#16 +	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")` + +	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp +	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A4xB")`,#16 +	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp +	vshr.u64	$temp,`&Dhi("$A4xB")`,#16 +	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")` + +	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp +	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A5xB")`,#16 +	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp +	vshr.u64	$temp,`&Dhi("$A5xB")`,#16 +	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")` + +	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp +	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A6xB")`,#16 +	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp +	vld1.64		{$A0xB}, [$tinptr, :128]! +	vshr.u64	$temp,`&Dhi("$A6xB")`,#16 +	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")` + +	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp +	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! +	vshr.u64	$temp,`&Dlo("$A7xB")`,#16 +	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp +	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]! +	vshr.u64	$temp,`&Dhi("$A7xB")`,#16 +	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")` +	subs		$inner,$inner,#8 +	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! + +	bne	.LNEON_tail + +	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit +	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr +	subs	$aptr,sp,#0				@ clear carry flag +	add	$bptr,sp,$num,lsl#2 + +.LNEON_sub: +	ldmia	$aptr!, {r4-r7} +	ldmia	$nptr!, {r8-r11} +	sbcs	r8, r4,r8 +	sbcs	r9, r5,r9 +	sbcs	r10,r6,r10 +	sbcs	r11,r7,r11 +	teq	$aptr,$bptr				@ preserves carry +	stmia	$rptr!, {r8-r11} +	bne	.LNEON_sub + +	ldr	r10, [$aptr]				@ load top-most bit +	veor	q0,q0,q0 +	sub	r11,$bptr,sp				@ this is num*4 +	veor	q1,q1,q1 +	mov	$aptr,sp +	sub	$rptr,$rptr,r11				@ rewind $rptr +	mov	$nptr,$bptr				@ second 3/4th of frame +	sbcs	r10,r10,#0				@ result is carry flag + +.LNEON_copy_n_zap: +	ldmia	$aptr!, {r4-r7} +	ldmia	$rptr,  {r8-r11} +	movcc	r8, r4 +	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe +	movcc	r9, r5 +	movcc	r10,r6 +	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe +	movcc	r11,r7 +	ldmia	$aptr, {r4-r7} +	stmia	$rptr!, {r8-r11} +	sub	$aptr,$aptr,#16 +	ldmia	$rptr, {r8-r11} +	movcc	r8, r4 +	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe +	movcc	r9, r5 +	movcc	r10,r6 +	vst1.64	{q0-q1}, [$nptr,:256]!			
@ wipe +	movcc	r11,r7 +	teq	$aptr,$bptr				@ preserves carry +	stmia	$rptr!, {r8-r11} +	bne	.LNEON_copy_n_zap + +	sub	sp,ip,#96 +        vldmia  sp!,{d8-d15} +        ldmia   sp!,{r4-r11} +	ret						@ bx lr +.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +___ +} +$code.=<<___; +.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"  .align	2 +#if __ARM_ARCH__>=7 +.comm	OPENSSL_armcap_P,4,4 +#endif  ___ +$code =~ s/\`([^\`]*)\`/eval $1/gem;  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx	lr/gm;  print $code;  close STDOUT; diff --git a/main/openssl/crypto/bn/asm/armv4-mont.s b/main/openssl/crypto/bn/asm/armv4-mont.s index 64c220b5..fecae15e 100644 --- a/main/openssl/crypto/bn/asm/armv4-mont.s +++ b/main/openssl/crypto/bn/asm/armv4-mont.s @@ -1,13 +1,37 @@ +#include "arm_arch.h" +  .text +.code	32 + +#if __ARM_ARCH__>=7 +.align	5 +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-bn_mul_mont +#endif  .global	bn_mul_mont  .type	bn_mul_mont,%function -.align	2 +.align	5  bn_mul_mont: +	ldr	ip,[sp,#4]		@ load num  	stmdb	sp!,{r0,r2}		@ sp points at argument block -	ldr	r0,[sp,#3*4]		@ load num -	cmp	r0,#2 +#if __ARM_ARCH__>=7 +	tst	ip,#7 +	bne	.Lialu +	adr	r0,bn_mul_mont +	ldr	r2,.LOPENSSL_armcap +	ldr	r0,[r0,r2] +	tst	r0,#1			@ NEON available? +	ldmia	sp, {r0,r2} +	beq	.Lialu +	add	sp,sp,#8 +	b	bn_mul8x_mont_neon +.align	4 +.Lialu: +#endif +	cmp	ip,#2 +	mov	r0,ip			@ load num  	movlt	r0,#0  	addlt	sp,sp,#2*4  	blt	.Labrt @@ -137,9 +161,419 @@ bn_mul_mont:  	ldmia	sp!,{r4-r12,lr}		@ restore registers  	add	sp,sp,#2*4		@ skip over {r0,r2}  	mov	r0,#1 -.Labrt:	tst	lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 +	bx	lr				@ .word	0xe12fff1e +#else +	tst	lr,#1  	moveq	pc,lr			@ be binary compatible with V4, yet  	.word	0xe12fff1e			@ interoperable with Thumb ISA:-) +#endif  .size	bn_mul_mont,.-bn_mul_mont -.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	bn_mul8x_mont_neon,%function +.align	5 +bn_mul8x_mont_neon: +	mov	ip,sp +	stmdb	sp!,{r4-r11} +	vstmdb	sp!,{d8-d15}		@ ABI specification says so +	ldmia	ip,{r4-r5}		@ load rest of parameter block + +	sub		r7,sp,#16 +	vld1.32		{d28[0]}, [r2,:32]! +	sub		r7,r7,r5,lsl#4 +	vld1.32		{d0-d3},  [r1]!		@ can't specify :32 :-( +	and		r7,r7,#-64 +	vld1.32		{d30[0]}, [r4,:32] +	mov		sp,r7			@ alloca +	veor		d8,d8,d8 +	subs		r8,r5,#8 +	vzip.16		d28,d8 + +	vmull.u32	q6,d28,d0[0] +	vmull.u32	q7,d28,d0[1] +	vmull.u32	q8,d28,d1[0] +	vshl.i64	d10,d13,#16 +	vmull.u32	q9,d28,d1[1] + +	vadd.u64	d10,d10,d12 +	veor		d8,d8,d8 +	vmul.u32	d29,d10,d30 + +	vmull.u32	q10,d28,d2[0] +	 vld1.32	{d4-d7}, [r3]! +	vmull.u32	q11,d28,d2[1] +	vmull.u32	q12,d28,d3[0] +	vzip.16		d29,d8 +	vmull.u32	q13,d28,d3[1] + +	bne	.LNEON_1st + +	@ special case for num=8, everything is in register bank... + +	vmlal.u32	q6,d29,d4[0] +	sub		r9,r5,#1 +	vmlal.u32	q7,d29,d4[1] +	vmlal.u32	q8,d29,d5[0] +	vmlal.u32	q9,d29,d5[1] + +	vmlal.u32	q10,d29,d6[0] +	vmov		q5,q6 +	vmlal.u32	q11,d29,d6[1] +	vmov		q6,q7 +	vmlal.u32	q12,d29,d7[0] +	vmov		q7,q8 +	vmlal.u32	q13,d29,d7[1] +	vmov		q8,q9 +	vmov		q9,q10 +	vshr.u64	d10,d10,#16 +	vmov		q10,q11 +	vmov		q11,q12 +	vadd.u64	d10,d10,d11 +	vmov		q12,q13 +	veor		q13,q13 +	vshr.u64	d10,d10,#16 + +	b	.LNEON_outer8 + +.align	4 +.LNEON_outer8: +	vld1.32		{d28[0]}, [r2,:32]! 
+	veor		d8,d8,d8 +	vzip.16		d28,d8 +	vadd.u64	d12,d12,d10 + +	vmlal.u32	q6,d28,d0[0] +	vmlal.u32	q7,d28,d0[1] +	vmlal.u32	q8,d28,d1[0] +	vshl.i64	d10,d13,#16 +	vmlal.u32	q9,d28,d1[1] + +	vadd.u64	d10,d10,d12 +	veor		d8,d8,d8 +	subs		r9,r9,#1 +	vmul.u32	d29,d10,d30 + +	vmlal.u32	q10,d28,d2[0] +	vmlal.u32	q11,d28,d2[1] +	vmlal.u32	q12,d28,d3[0] +	vzip.16		d29,d8 +	vmlal.u32	q13,d28,d3[1] + +	vmlal.u32	q6,d29,d4[0] +	vmlal.u32	q7,d29,d4[1] +	vmlal.u32	q8,d29,d5[0] +	vmlal.u32	q9,d29,d5[1] + +	vmlal.u32	q10,d29,d6[0] +	vmov		q5,q6 +	vmlal.u32	q11,d29,d6[1] +	vmov		q6,q7 +	vmlal.u32	q12,d29,d7[0] +	vmov		q7,q8 +	vmlal.u32	q13,d29,d7[1] +	vmov		q8,q9 +	vmov		q9,q10 +	vshr.u64	d10,d10,#16 +	vmov		q10,q11 +	vmov		q11,q12 +	vadd.u64	d10,d10,d11 +	vmov		q12,q13 +	veor		q13,q13 +	vshr.u64	d10,d10,#16 + +	bne	.LNEON_outer8 + +	vadd.u64	d12,d12,d10 +	mov		r7,sp +	vshr.u64	d10,d12,#16 +	mov		r8,r5 +	vadd.u64	d13,d13,d10 +	add		r6,sp,#16 +	vshr.u64	d10,d13,#16 +	vzip.16		d12,d13 + +	b	.LNEON_tail2 + +.align	4 +.LNEON_1st: +	vmlal.u32	q6,d29,d4[0] +	 vld1.32	{d0-d3}, [r1]! +	vmlal.u32	q7,d29,d4[1] +	subs		r8,r8,#8 +	vmlal.u32	q8,d29,d5[0] +	vmlal.u32	q9,d29,d5[1] + +	vmlal.u32	q10,d29,d6[0] +	 vld1.32	{d4-d5}, [r3]! +	vmlal.u32	q11,d29,d6[1] +	 vst1.64	{q6-q7}, [r7,:256]! +	vmlal.u32	q12,d29,d7[0] +	vmlal.u32	q13,d29,d7[1] +	 vst1.64	{q8-q9}, [r7,:256]! + +	vmull.u32	q6,d28,d0[0] +	 vld1.32	{d6-d7}, [r3]! +	vmull.u32	q7,d28,d0[1] +	 vst1.64	{q10-q11}, [r7,:256]! +	vmull.u32	q8,d28,d1[0] +	vmull.u32	q9,d28,d1[1] +	 vst1.64	{q12-q13}, [r7,:256]! + +	vmull.u32	q10,d28,d2[0] +	vmull.u32	q11,d28,d2[1] +	vmull.u32	q12,d28,d3[0] +	vmull.u32	q13,d28,d3[1] + +	bne	.LNEON_1st + +	vmlal.u32	q6,d29,d4[0] +	add		r6,sp,#16 +	vmlal.u32	q7,d29,d4[1] +	sub		r1,r1,r5,lsl#2		@ rewind r1 +	vmlal.u32	q8,d29,d5[0] +	 vld1.64	{q5}, [sp,:128] +	vmlal.u32	q9,d29,d5[1] +	sub		r9,r5,#1 + +	vmlal.u32	q10,d29,d6[0] +	vst1.64		{q6-q7}, [r7,:256]! +	vmlal.u32	q11,d29,d6[1] +	vshr.u64	d10,d10,#16 +	 vld1.64	{q6},       [r6, :128]! +	vmlal.u32	q12,d29,d7[0] +	vst1.64		{q8-q9}, [r7,:256]! +	vmlal.u32	q13,d29,d7[1] + +	vst1.64		{q10-q11}, [r7,:256]! +	vadd.u64	d10,d10,d11 +	veor		q4,q4,q4 +	vst1.64		{q12-q13}, [r7,:256]! +	 vld1.64	{q7-q8}, [r6, :256]! +	vst1.64		{q4},          [r7,:128] +	vshr.u64	d10,d10,#16 + +	b		.LNEON_outer + +.align	4 +.LNEON_outer: +	vld1.32		{d28[0]}, [r2,:32]! +	sub		r3,r3,r5,lsl#2		@ rewind r3 +	vld1.32		{d0-d3},  [r1]! +	veor		d8,d8,d8 +	mov		r7,sp +	vzip.16		d28,d8 +	sub		r8,r5,#8 +	vadd.u64	d12,d12,d10 + +	vmlal.u32	q6,d28,d0[0] +	 vld1.64	{q9-q10},[r6,:256]! +	vmlal.u32	q7,d28,d0[1] +	vmlal.u32	q8,d28,d1[0] +	 vld1.64	{q11-q12},[r6,:256]! +	vmlal.u32	q9,d28,d1[1] + +	vshl.i64	d10,d13,#16 +	veor		d8,d8,d8 +	vadd.u64	d10,d10,d12 +	 vld1.64	{q13},[r6,:128]! +	vmul.u32	d29,d10,d30 + +	vmlal.u32	q10,d28,d2[0] +	 vld1.32	{d4-d7}, [r3]! +	vmlal.u32	q11,d28,d2[1] +	vmlal.u32	q12,d28,d3[0] +	vzip.16		d29,d8 +	vmlal.u32	q13,d28,d3[1] + +.LNEON_inner: +	vmlal.u32	q6,d29,d4[0] +	 vld1.32	{d0-d3}, [r1]! +	vmlal.u32	q7,d29,d4[1] +	 subs		r8,r8,#8 +	vmlal.u32	q8,d29,d5[0] +	vmlal.u32	q9,d29,d5[1] +	vst1.64		{q6-q7}, [r7,:256]! + +	vmlal.u32	q10,d29,d6[0] +	 vld1.64	{q6},       [r6, :128]! +	vmlal.u32	q11,d29,d6[1] +	vst1.64		{q8-q9}, [r7,:256]! +	vmlal.u32	q12,d29,d7[0] +	 vld1.64	{q7-q8}, [r6, :256]! +	vmlal.u32	q13,d29,d7[1] +	vst1.64		{q10-q11}, [r7,:256]! + +	vmlal.u32	q6,d28,d0[0] +	 vld1.64	{q9-q10}, [r6, :256]! +	vmlal.u32	q7,d28,d0[1] +	vst1.64		{q12-q13}, [r7,:256]! 
+	vmlal.u32	q8,d28,d1[0] +	 vld1.64	{q11-q12}, [r6, :256]! +	vmlal.u32	q9,d28,d1[1] +	 vld1.32	{d4-d7}, [r3]! + +	vmlal.u32	q10,d28,d2[0] +	 vld1.64	{q13},       [r6, :128]! +	vmlal.u32	q11,d28,d2[1] +	vmlal.u32	q12,d28,d3[0] +	vmlal.u32	q13,d28,d3[1] + +	bne	.LNEON_inner + +	vmlal.u32	q6,d29,d4[0] +	add		r6,sp,#16 +	vmlal.u32	q7,d29,d4[1] +	sub		r1,r1,r5,lsl#2		@ rewind r1 +	vmlal.u32	q8,d29,d5[0] +	 vld1.64	{q5}, [sp,:128] +	vmlal.u32	q9,d29,d5[1] +	subs		r9,r9,#1 + +	vmlal.u32	q10,d29,d6[0] +	vst1.64		{q6-q7}, [r7,:256]! +	vmlal.u32	q11,d29,d6[1] +	 vld1.64	{q6},       [r6, :128]! +	vshr.u64	d10,d10,#16 +	vst1.64		{q8-q9}, [r7,:256]! +	vmlal.u32	q12,d29,d7[0] +	 vld1.64	{q7-q8}, [r6, :256]! +	vmlal.u32	q13,d29,d7[1] + +	vst1.64		{q10-q11}, [r7,:256]! +	vadd.u64	d10,d10,d11 +	vst1.64		{q12-q13}, [r7,:256]! +	vshr.u64	d10,d10,#16 + +	bne	.LNEON_outer + +	mov		r7,sp +	mov		r8,r5 + +.LNEON_tail: +	vadd.u64	d12,d12,d10 +	vld1.64		{q9-q10}, [r6, :256]! +	vshr.u64	d10,d12,#16 +	vadd.u64	d13,d13,d10 +	vld1.64		{q11-q12}, [r6, :256]! +	vshr.u64	d10,d13,#16 +	vld1.64		{q13},       [r6, :128]! +	vzip.16		d12,d13 + +.LNEON_tail2: +	vadd.u64	d14,d14,d10 +	vst1.32		{d12[0]}, [r7, :32]! +	vshr.u64	d10,d14,#16 +	vadd.u64	d15,d15,d10 +	vshr.u64	d10,d15,#16 +	vzip.16		d14,d15 + +	vadd.u64	d16,d16,d10 +	vst1.32		{d14[0]}, [r7, :32]! +	vshr.u64	d10,d16,#16 +	vadd.u64	d17,d17,d10 +	vshr.u64	d10,d17,#16 +	vzip.16		d16,d17 + +	vadd.u64	d18,d18,d10 +	vst1.32		{d16[0]}, [r7, :32]! +	vshr.u64	d10,d18,#16 +	vadd.u64	d19,d19,d10 +	vshr.u64	d10,d19,#16 +	vzip.16		d18,d19 + +	vadd.u64	d20,d20,d10 +	vst1.32		{d18[0]}, [r7, :32]! +	vshr.u64	d10,d20,#16 +	vadd.u64	d21,d21,d10 +	vshr.u64	d10,d21,#16 +	vzip.16		d20,d21 + +	vadd.u64	d22,d22,d10 +	vst1.32		{d20[0]}, [r7, :32]! +	vshr.u64	d10,d22,#16 +	vadd.u64	d23,d23,d10 +	vshr.u64	d10,d23,#16 +	vzip.16		d22,d23 + +	vadd.u64	d24,d24,d10 +	vst1.32		{d22[0]}, [r7, :32]! +	vshr.u64	d10,d24,#16 +	vadd.u64	d25,d25,d10 +	vld1.64		{q6}, [r6, :128]! +	vshr.u64	d10,d25,#16 +	vzip.16		d24,d25 + +	vadd.u64	d26,d26,d10 +	vst1.32		{d24[0]}, [r7, :32]! +	vshr.u64	d10,d26,#16 +	vadd.u64	d27,d27,d10 +	vld1.64		{q7-q8},	[r6, :256]! +	vshr.u64	d10,d27,#16 +	vzip.16		d26,d27 +	subs		r8,r8,#8 +	vst1.32		{d26[0]}, [r7, :32]! + +	bne	.LNEON_tail + +	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit +	sub	r3,r3,r5,lsl#2			@ rewind r3 +	subs	r1,sp,#0				@ clear carry flag +	add	r2,sp,r5,lsl#2 + +.LNEON_sub: +	ldmia	r1!, {r4-r7} +	ldmia	r3!, {r8-r11} +	sbcs	r8, r4,r8 +	sbcs	r9, r5,r9 +	sbcs	r10,r6,r10 +	sbcs	r11,r7,r11 +	teq	r1,r2				@ preserves carry +	stmia	r0!, {r8-r11} +	bne	.LNEON_sub + +	ldr	r10, [r1]				@ load top-most bit +	veor	q0,q0,q0 +	sub	r11,r2,sp				@ this is num*4 +	veor	q1,q1,q1 +	mov	r1,sp +	sub	r0,r0,r11				@ rewind r0 +	mov	r3,r2				@ second 3/4th of frame +	sbcs	r10,r10,#0				@ result is carry flag + +.LNEON_copy_n_zap: +	ldmia	r1!, {r4-r7} +	ldmia	r0,  {r8-r11} +	movcc	r8, r4 +	vst1.64	{q0-q1}, [r3,:256]!			@ wipe +	movcc	r9, r5 +	movcc	r10,r6 +	vst1.64	{q0-q1}, [r3,:256]!			@ wipe +	movcc	r11,r7 +	ldmia	r1, {r4-r7} +	stmia	r0!, {r8-r11} +	sub	r1,r1,#16 +	ldmia	r0, {r8-r11} +	movcc	r8, r4 +	vst1.64	{q0-q1}, [r1,:256]!			@ wipe +	movcc	r9, r5 +	movcc	r10,r6 +	vst1.64	{q0-q1}, [r3,:256]!			
@ wipe +	movcc	r11,r7 +	teq	r1,r2				@ preserves carry +	stmia	r0!, {r8-r11} +	bne	.LNEON_copy_n_zap + +	sub	sp,ip,#96 +        vldmia  sp!,{d8-d15} +        ldmia   sp!,{r4-r11} +	bx	lr						@ .word	0xe12fff1e +.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"  .align	2 +#if __ARM_ARCH__>=7 +.comm	OPENSSL_armcap_P,4,4 +#endif diff --git a/main/openssl/crypto/evp/e_aes.c b/main/openssl/crypto/evp/e_aes.c index 41cee42d..ad0f7a4a 100644 --- a/main/openssl/crypto/evp/e_aes.c +++ b/main/openssl/crypto/evp/e_aes.c @@ -62,7 +62,7 @@  typedef struct  	{ -	AES_KEY ks; +	union { double align; AES_KEY ks; } ks;  	block128_f block;  	union {  		cbc128_f cbc; @@ -72,7 +72,7 @@ typedef struct  typedef struct  	{ -	AES_KEY ks;		/* AES key schedule to use */ +	union { double align; AES_KEY ks; } ks;	/* AES key schedule to use */  	int key_set;		/* Set if key initialised */  	int iv_set;		/* Set if an iv is set */  	GCM128_CONTEXT gcm; @@ -86,7 +86,7 @@ typedef struct  typedef struct  	{ -	AES_KEY ks1, ks2;	/* AES key schedules to use */ +	union { double align; AES_KEY ks; } ks1, ks2;	/* AES key schedules to use */  	XTS128_CONTEXT xts;  	void     (*stream)(const unsigned char *in,  			unsigned char *out, size_t length, @@ -96,7 +96,7 @@ typedef struct  typedef struct  	{ -	AES_KEY ks;		/* AES key schedule to use */ +	union { double align; AES_KEY ks; } ks;	/* AES key schedule to use */  	int key_set;		/* Set if key initialised */  	int iv_set;		/* Set if an iv is set */  	int tag_set;		/* Set if tag is valid */ @@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len,  	defined(_M_AMD64)	|| defined(_M_X64)	|| \  	defined(__INTEL__)				) -extern unsigned int OPENSSL_ia32cap_P[2]; +extern unsigned int OPENSSL_ia32cap_P[];  #ifdef VPAES_ASM  #define VPAES_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(41-32))) @@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		return 1;  	if (key)  		{ -		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); +		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);  		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,  				(block128_f)aesni_encrypt);  		gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; @@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		/* key_len is two AES keys */  		if (enc)  			{ -			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)aesni_encrypt;  			xctx->stream = aesni_xts_encrypt;  			}  		else  			{ -			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)aesni_decrypt;  			xctx->stream = aesni_xts_decrypt;  			}  		aesni_set_encrypt_key(key + ctx->key_len/2, -						ctx->key_len * 4, &xctx->ks2); +						ctx->key_len * 4, &xctx->ks2.ks);  		xctx->xts.block2 = (block128_f)aesni_encrypt;  		xctx->xts.key1 = &xctx->ks1; @@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		return 1;  	if (key)  		{ -		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); +		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);  		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,  					&cctx->ks, (block128_f)aesni_encrypt);  		cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks : @@ -482,14 +482,38 @@ static 
const EVP_CIPHER aes_##keylen##_##mode = { \  	NULL,NULL,aes_##mode##_ctrl,NULL }; \  const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \  { return &aes_##keylen##_##mode; } -  #endif -#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm)) +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))  #include "arm_arch.h"  #if __ARM_ARCH__>=7 -#define BSAES_CAPABLE  (OPENSSL_armcap_P & ARMV7_NEON) +# if defined(BSAES_ASM) +#  define BSAES_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON) +# endif +# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# define HWAES_set_encrypt_key aes_v8_set_encrypt_key +# define HWAES_set_decrypt_key aes_v8_set_decrypt_key +# define HWAES_encrypt aes_v8_encrypt +# define HWAES_decrypt aes_v8_decrypt +# define HWAES_cbc_encrypt aes_v8_cbc_encrypt +# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks +#endif  #endif + +#if defined(HWAES_CAPABLE) +int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits, +	AES_KEY *key); +int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits, +	AES_KEY *key); +void HWAES_encrypt(const unsigned char *in, unsigned char *out, +	const AES_KEY *key); +void HWAES_decrypt(const unsigned char *in, unsigned char *out, +	const AES_KEY *key); +void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out, +	size_t length, const AES_KEY *key, +	unsigned char *ivec, const int enc); +void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, +	size_t len, const AES_KEY *key, const unsigned char ivec[16]);  #endif  #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\ @@ -510,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  	mode = ctx->cipher->flags & EVP_CIPH_MODE;  	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)  	    && !enc) +#ifdef HWAES_CAPABLE +	    if (HWAES_CAPABLE) +		{ +		ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); +		dat->block      = (block128_f)HWAES_decrypt; +		dat->stream.cbc = NULL; +#ifdef HWAES_cbc_encrypt +		if (mode==EVP_CIPH_CBC_MODE) +		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; +#endif +		} +	    else +#endif  #ifdef BSAES_CAPABLE  	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)  		{ -		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)AES_decrypt;  		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;  		} @@ -522,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  #ifdef VPAES_CAPABLE  	    if (VPAES_CAPABLE)  		{ -		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)vpaes_decrypt;  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?  					(cbc128_f)vpaes_cbc_encrypt : @@ -531,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  	    else  #endif  		{ -		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)AES_decrypt;  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?  					
(cbc128_f)AES_cbc_encrypt :  					NULL;  		}  	else +#ifdef HWAES_CAPABLE +	    if (HWAES_CAPABLE) +		{ +		ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); +		dat->block      = (block128_f)HWAES_encrypt; +		dat->stream.cbc = NULL; +#ifdef HWAES_cbc_encrypt +		if (mode==EVP_CIPH_CBC_MODE) +		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; +		else +#endif +#ifdef HWAES_ctr32_encrypt_blocks +		if (mode==EVP_CIPH_CTR_MODE) +		    dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; +		else +#endif +		(void)0;	/* terminate potentially open 'else' */ +		} +	    else +#endif  #ifdef BSAES_CAPABLE  	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)  		{ -		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)AES_encrypt;  		dat->stream.ctr	= (ctr128_f)bsaes_ctr32_encrypt_blocks;  		} @@ -550,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  #ifdef VPAES_CAPABLE  	    if (VPAES_CAPABLE)  		{ -		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)vpaes_encrypt;  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?  					(cbc128_f)vpaes_cbc_encrypt : @@ -559,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  	    else  #endif  		{ -		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); +		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);  		dat->block	= (block128_f)AES_encrypt;  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?  					(cbc128_f)AES_cbc_encrypt : @@ -830,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		return 1;  	if (key)  		{ do { +#ifdef HWAES_CAPABLE +		if (HWAES_CAPABLE) +			{ +			HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); +			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, +					(block128_f)HWAES_encrypt); +#ifdef HWAES_ctr32_encrypt_blocks +			gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; +#else +			gctx->ctr = NULL; +#endif +			break; +			} +		else +#endif  #ifdef BSAES_CAPABLE  		if (BSAES_CAPABLE)  			{ -			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); +			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);  			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,  					(block128_f)AES_encrypt);  			gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; @@ -844,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  #ifdef VPAES_CAPABLE  		if (VPAES_CAPABLE)  			{ -			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); +			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);  			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,  					(block128_f)vpaes_encrypt);  			gctx->ctr = NULL; @@ -854,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  #endif  		(void)0;	/* terminate potentially open 'else' */ -		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); +		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);  		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);  #ifdef AES_CTR_ASM  		gctx->ctr = (ctr128_f)AES_ctr32_encrypt; @@ -1075,29 +1147,50 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		xctx->stream = NULL;  #endif  		/* key_len is two AES keys */ -#if !(defined(__arm__) || defined(__arm))      /* not yet? 
*/ +#ifdef HWAES_CAPABLE +		if (HWAES_CAPABLE) +			{ +			if (enc) +			    { +			    HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); +			    xctx->xts.block1 = (block128_f)HWAES_encrypt; +			    } +			else +			    { +			    HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); +			    xctx->xts.block1 = (block128_f)HWAES_decrypt; +			    } + +			HWAES_set_encrypt_key(key + ctx->key_len/2, +						    ctx->key_len * 4, &xctx->ks2.ks); +			xctx->xts.block2 = (block128_f)HWAES_encrypt; + +			xctx->xts.key1 = &xctx->ks1; +			break; +			} +		else +#endif  #ifdef BSAES_CAPABLE  		if (BSAES_CAPABLE)  			xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;  		else  #endif -#endif  #ifdef VPAES_CAPABLE  		if (VPAES_CAPABLE)  		    {  		    if (enc)  			{ -			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)vpaes_encrypt;  			}  		    else  			{ -			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)vpaes_decrypt;  			}  		    vpaes_set_encrypt_key(key + ctx->key_len/2, -						ctx->key_len * 4, &xctx->ks2); +						ctx->key_len * 4, &xctx->ks2.ks);  		    xctx->xts.block2 = (block128_f)vpaes_encrypt;  		    xctx->xts.key1 = &xctx->ks1; @@ -1109,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		if (enc)  			{ -			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)AES_encrypt;  			}  		else  			{ -			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); +			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);  			xctx->xts.block1 = (block128_f)AES_decrypt;  			}  		AES_set_encrypt_key(key + ctx->key_len/2, -						ctx->key_len * 4, &xctx->ks2); +						ctx->key_len * 4, &xctx->ks2.ks);  		xctx->xts.block2 = (block128_f)AES_encrypt;  		xctx->xts.key1 = &xctx->ks1; @@ -1227,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  		return 1;  	if (key) do  		{ +#ifdef HWAES_CAPABLE +		if (HWAES_CAPABLE) +			{ +			HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks); + +			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, +					&cctx->ks, (block128_f)HWAES_encrypt); +			cctx->str = NULL; +			cctx->key_set = 1; +			break; +			} +		else +#endif  #ifdef VPAES_CAPABLE  		if (VPAES_CAPABLE)  			{ -			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); +			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks);  			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,  					&cctx->ks, (block128_f)vpaes_encrypt);  			cctx->str = NULL; @@ -1238,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,  			break;  			}  #endif -		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); +		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);  		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,  					&cctx->ks, (block128_f)AES_encrypt);  		cctx->str = NULL; diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.S b/main/openssl/crypto/modes/asm/ghash-armv4.S index d66c4cbf..6c453774 100644 --- a/main/openssl/crypto/modes/asm/ghash-armv4.S +++ b/main/openssl/crypto/modes/asm/ghash-armv4.S @@ -309,99 +309,213 @@ gcm_gmult_4bit:  #if __ARM_ARCH__>=7  .fpu	neon +.global	gcm_init_neon +.type	gcm_init_neon,%function +.align	4 +gcm_init_neon: +	vld1.64		d7,[r1,:64]!	
@ load H +	vmov.i8		q8,#0xe1 +	vld1.64		d6,[r1,:64] +	vshl.i64	d17,#57 +	vshr.u64	d16,#63		@ t0=0xc2....01 +	vdup.8		q9,d7[7] +	vshr.u64	d26,d6,#63 +	vshr.s8		q9,#7			@ broadcast carry bit +	vshl.i64	q3,q3,#1 +	vand		q8,q8,q9 +	vorr		d7,d26		@ H<<<=1 +	veor		q3,q3,q8		@ twisted H +	vstmia		r0,{q3} + +	bx	lr					@ bx lr +.size	gcm_init_neon,.-gcm_init_neon +  .global	gcm_gmult_neon  .type	gcm_gmult_neon,%function  .align	4  gcm_gmult_neon: -	sub		r1,#16		@ point at H in GCM128_CTX -	vld1.64		d29,[r0,:64]!@ load Xi -	vmov.i32	d5,#0xe1		@ our irreducible polynomial -	vld1.64		d28,[r0,:64]! -	vshr.u64	d5,#32 -	vldmia		r1,{d0-d1}	@ load H -	veor		q12,q12 +	vld1.64		d7,[r0,:64]!	@ load Xi +	vld1.64		d6,[r0,:64]! +	vmov.i64	d29,#0x0000ffffffffffff +	vldmia		r1,{d26-d27}	@ load twisted H +	vmov.i64	d30,#0x00000000ffffffff  #ifdef __ARMEL__ -	vrev64.8	q14,q14 +	vrev64.8	q3,q3  #endif -	veor		q13,q13 -	veor		q11,q11 -	mov		r1,#16 -	veor		q10,q10 +	vmov.i64	d31,#0x000000000000ffff +	veor		d28,d26,d27		@ Karatsuba pre-processing  	mov		r3,#16 -	veor		d2,d2 -	vdup.8		d4,d28[0]	@ broadcast lowest byte -	b		.Linner_neon +	b		.Lgmult_neon  .size	gcm_gmult_neon,.-gcm_gmult_neon  .global	gcm_ghash_neon  .type	gcm_ghash_neon,%function  .align	4  gcm_ghash_neon: -	vld1.64		d21,[r0,:64]!	@ load Xi -	vmov.i32	d5,#0xe1		@ our irreducible polynomial -	vld1.64		d20,[r0,:64]! -	vshr.u64	d5,#32 -	vldmia		r0,{d0-d1}		@ load H -	veor		q12,q12 -	nop +	vld1.64		d1,[r0,:64]!	@ load Xi +	vld1.64		d0,[r0,:64]! +	vmov.i64	d29,#0x0000ffffffffffff +	vldmia		r1,{d26-d27}	@ load twisted H +	vmov.i64	d30,#0x00000000ffffffff  #ifdef __ARMEL__ -	vrev64.8	q10,q10 +	vrev64.8	q0,q0  #endif -.Louter_neon: -	vld1.64		d29,[r2]!	@ load inp -	veor		q13,q13 -	vld1.64		d28,[r2]! -	veor		q11,q11 -	mov		r1,#16 +	vmov.i64	d31,#0x000000000000ffff +	veor		d28,d26,d27		@ Karatsuba pre-processing + +.Loop_neon: +	vld1.64		d7,[r2]!		@ load inp +	vld1.64		d6,[r2]!  
#ifdef __ARMEL__ -	vrev64.8	q14,q14 +	vrev64.8	q3,q3  #endif -	veor		d2,d2 -	veor		q14,q10			@ inp^=Xi -	veor		q10,q10 -	vdup.8		d4,d28[0]	@ broadcast lowest byte -.Linner_neon: -	subs		r1,r1,#1 -	vmull.p8	q9,d1,d4		@ H.lo·Xi[i] -	vmull.p8	q8,d0,d4		@ H.hi·Xi[i] -	vext.8		q14,q12,#1		@ IN>>=8 - -	veor		q10,q13		@ modulo-scheduled part -	vshl.i64	d22,#48 -	vdup.8		d4,d28[0]	@ broadcast lowest byte -	veor		d3,d18,d20 - -	veor		d21,d22 -	vuzp.8		q9,q8 -	vsli.8		d2,d3,#1		@ compose the "carry" byte -	vext.8		q10,q12,#1		@ Z>>=8 +	veor		q3,q0			@ inp^=Xi +.Lgmult_neon: +	vext.8		d16, d26, d26, #1	@ A1 +	vmull.p8	q8, d16, d6		@ F = A1*B +	vext.8		d0, d6, d6, #1	@ B1 +	vmull.p8	q0, d26, d0		@ E = A*B1 +	vext.8		d18, d26, d26, #2	@ A2 +	vmull.p8	q9, d18, d6		@ H = A2*B +	vext.8		d22, d6, d6, #2	@ B2 +	vmull.p8	q11, d26, d22		@ G = A*B2 +	vext.8		d20, d26, d26, #3	@ A3 +	veor		q8, q8, q0		@ L = E + F +	vmull.p8	q10, d20, d6		@ J = A3*B +	vext.8		d0, d6, d6, #3	@ B3 +	veor		q9, q9, q11		@ M = G + H +	vmull.p8	q0, d26, d0		@ I = A*B3 +	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8 +	vand		d17, d17, d29 +	vext.8		d22, d6, d6, #4	@ B4 +	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16 +	vand		d19, d19, d30 +	vmull.p8	q11, d26, d22		@ K = A*B4 +	veor		q10, q10, q0		@ N = I + J +	veor		d16, d16, d17 +	veor		d18, d18, d19 +	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24 +	vand		d21, d21, d31 +	vext.8		q8, q8, q8, #15 +	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	d23, #0 +	vext.8		q9, q9, q9, #14 +	veor		d20, d20, d21 +	vmull.p8	q0, d26, d6		@ D = A*B +	vext.8		q11, q11, q11, #12 +	vext.8		q10, q10, q10, #13 +	veor		q8, q8, q9 +	veor		q10, q10, q11 +	veor		q0, q0, q8 +	veor		q0, q0, q10 +	veor		d6,d6,d7	@ Karatsuba pre-processing +	vext.8		d16, d28, d28, #1	@ A1 +	vmull.p8	q8, d16, d6		@ F = A1*B +	vext.8		d2, d6, d6, #1	@ B1 +	vmull.p8	q1, d28, d2		@ E = A*B1 +	vext.8		d18, d28, d28, #2	@ A2 +	vmull.p8	q9, d18, d6		@ H = A2*B +	vext.8		d22, d6, d6, #2	@ B2 +	vmull.p8	q11, d28, d22		@ G = A*B2 +	vext.8		d20, d28, d28, #3	@ A3 +	veor		q8, q8, q1		@ L = E + F +	vmull.p8	q10, d20, d6		@ J = A3*B +	vext.8		d2, d6, d6, #3	@ B3 +	veor		q9, q9, q11		@ M = G + H +	vmull.p8	q1, d28, d2		@ I = A*B3 +	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8 +	vand		d17, d17, d29 +	vext.8		d22, d6, d6, #4	@ B4 +	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16 +	vand		d19, d19, d30 +	vmull.p8	q11, d28, d22		@ K = A*B4 +	veor		q10, q10, q1		@ N = I + J +	veor		d16, d16, d17 +	veor		d18, d18, d19 +	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24 +	vand		d21, d21, d31 +	vext.8		q8, q8, q8, #15 +	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	d23, #0 +	vext.8		q9, q9, q9, #14 +	veor		d20, d20, d21 +	vmull.p8	q1, d28, d6		@ D = A*B +	vext.8		q11, q11, q11, #12 +	vext.8		q10, q10, q10, #13 +	veor		q8, q8, q9 +	veor		q10, q10, q11 +	veor		q1, q1, q8 +	veor		q1, q1, q10 +	vext.8		d16, d27, d27, #1	@ A1 +	vmull.p8	q8, d16, d7		@ F = A1*B +	vext.8		d4, d7, d7, #1	@ B1 +	vmull.p8	q2, d27, d4		@ E = A*B1 +	vext.8		d18, d27, d27, #2	@ A2 +	vmull.p8	q9, d18, d7		@ H = A2*B +	vext.8		d22, d7, d7, #2	@ B2 +	vmull.p8	q11, d27, d22		@ G = A*B2 +	vext.8		d20, d27, d27, #3	@ A3 +	veor		q8, q8, q2		@ L = E + F +	vmull.p8	q10, d20, d7		@ J = A3*B +	vext.8		d4, d7, d7, #3	@ B3 +	veor		q9, q9, q11		@ M = G + H +	vmull.p8	q2, d27, d4		@ I = A*B3 +	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8 +	vand		d17, d17, d29 +	vext.8		d22, d7, d7, #4	@ B4 +	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16 +	vand		d19, d19, d30 +	vmull.p8	q11, 
d27, d22		@ K = A*B4 +	veor		q10, q10, q2		@ N = I + J +	veor		d16, d16, d17 +	veor		d18, d18, d19 +	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24 +	vand		d21, d21, d31 +	vext.8		q8, q8, q8, #15 +	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	d23, #0 +	vext.8		q9, q9, q9, #14 +	veor		d20, d20, d21 +	vmull.p8	q2, d27, d7		@ D = A*B +	vext.8		q11, q11, q11, #12 +	vext.8		q10, q10, q10, #13 +	veor		q8, q8, q9 +	veor		q10, q10, q11 +	veor		q2, q2, q8 +	veor		q2, q2, q10 +	veor		q1,q1,q0		@ Karatsuba post-processing +	veor		q1,q1,q2 +	veor		d1,d1,d2 +	veor		d4,d4,d3	@ Xh|Xl - 256-bit result -	vmull.p8	q11,d2,d5		@ "carry"·0xe1 -	vshr.u8		d2,d3,#7		@ save Z's bottom bit -	vext.8		q13,q9,q12,#1	@ Qlo>>=8 -	veor		q10,q8 -	bne		.Linner_neon +	@ equivalent of reduction_avx from ghash-x86_64.pl +	vshl.i64	q9,q0,#57		@ 1st phase +	vshl.i64	q10,q0,#62 +	veor		q10,q10,q9		@ +	vshl.i64	q9,q0,#63 +	veor		q10, q10, q9		@ + 	veor		d1,d1,d20	@ +	veor		d4,d4,d21 -	veor		q10,q13		@ modulo-scheduled artefact -	vshl.i64	d22,#48 -	veor		d21,d22 +	vshr.u64	q10,q0,#1		@ 2nd phase +	veor		q2,q2,q0 +	veor		q0,q0,q10		@ +	vshr.u64	q10,q10,#6 +	vshr.u64	q0,q0,#1		@ +	veor		q0,q0,q2		@ +	veor		q0,q0,q10		@ -	@ finalization, normalize Z:Zo -	vand		d2,d5		@ suffices to mask the bit -	vshr.u64	d3,d20,#63 -	vshl.i64	q10,#1  	subs		r3,#16 -	vorr		q10,q1		@ Z=Z:Zo<<1 -	bne		.Louter_neon +	bne		.Loop_neon  #ifdef __ARMEL__ -	vrev64.8	q10,q10 +	vrev64.8	q0,q0  #endif  	sub		r0,#16	 -	vst1.64		d21,[r0,:64]!	@ write out Xi -	vst1.64		d20,[r0,:64] +	vst1.64		d1,[r0,:64]!	@ write out Xi +	vst1.64		d0,[r0,:64] -	.word	0xe12fff1e +	bx	lr					@ bx lr  .size	gcm_ghash_neon,.-gcm_ghash_neon  #endif  .asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.pl b/main/openssl/crypto/modes/asm/ghash-armv4.pl index e46f8e34..b79ecbcc 100644 --- a/main/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/main/openssl/crypto/modes/asm/ghash-armv4.pl @@ -35,6 +35,20 @@  # Add NEON implementation featuring polynomial multiplication, i.e. no  # lookup tables involved. On Cortex A8 it was measured to process one  # byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - +# in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +#  +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf  # ====================================================================  # Note about "528B" variant. In ARM case it makes lesser sense to @@ -303,117 +317,160 @@ $code.=<<___;  .size	gcm_gmult_4bit,.-gcm_gmult_4bit  ___  { -my $cnt=$Htbl;	# $Htbl is used once in the very beginning - -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); -my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); - -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit -# in Zo. Or should I say "top bit", because GHASH is specified in -# reverse bit order? Otherwise straightforward 128-bt H by one input -# byte multiplication and modulo-reduction, times 16. 
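[Editor's note -- not part of the patch] The April 2014 commentary above says the NEON GHASH was switched from the old byte-at-a-time multiply-and-reduce loop (described in the deleted comment just before this point) to a Karatsuba-style split of the 128x128-bit carry-less product, combined with the reduction lifted from ghash-x86_64.pl. The C sketch below illustrates only that Karatsuba decomposition, i.e. the three products the new code computes ($Xl = H.lo*Xi.lo, $Xh = H.hi*Xi.hi, $Xm = (H.lo+H.hi)*(Xi.lo+Xi.hi), where + is XOR in GF(2)); clmul64() and clmul128_karatsuba() are illustrative names, the bit-serial multiply merely stands in for vmull.p8/PMULL, and the final reduction modulo x^128+x^7+x^2+x+1 (plus GHASH's bit-reflected representation) is deliberately left out.

/* Illustrative sketch only: Karatsuba split of a 128x128-bit carry-less
 * multiplication into three 64x64-bit products, as used by the new NEON
 * GHASH path.  clmul64() is a slow bit-serial stand-in for vmull.p8/PMULL;
 * the GHASH reduction and bit reflection are intentionally omitted. */
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* carry-less (GF(2)[x]) 64x64 -> 128-bit multiply */
static u128 clmul64(uint64_t a, uint64_t b)
{
    u128 r = { 0, 0 };
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i)
                r.hi ^= a >> (64 - i);
        }
    return r;
}

/* 128x128 -> 256-bit carry-less product using three multiplications */
static void clmul128_karatsuba(const u128 *h, const u128 *x,
                               u128 *lo, u128 *hi)
{
    u128 d = clmul64(h->lo, x->lo);                 /* Xl = H.lo*Xi.lo      */
    u128 e = clmul64(h->hi, x->hi);                 /* Xh = H.hi*Xi.hi      */
    u128 m = clmul64(h->lo ^ h->hi, x->lo ^ x->hi); /* Xm = middle product  */

    m.lo ^= d.lo ^ e.lo;                            /* Karatsuba fix-up:    */
    m.hi ^= d.hi ^ e.hi;                            /* Xm ^= Xl ^ Xh        */

    lo->lo = d.lo;                                  /* bits   0..63         */
    lo->hi = d.hi ^ m.lo;                           /* bits  64..127        */
    hi->lo = e.lo ^ m.hi;                           /* bits 128..191        */
    hi->hi = e.hi;                                  /* bits 192..255        */
}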
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); -sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     } -sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   } -sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; +	vext.8		$t0#lo, $a, $a, #1	@ A1 +	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B +	vext.8		$r#lo, $b, $b, #1	@ B1 +	vmull.p8	$r, $a, $r#lo		@ E = A*B1 +	vext.8		$t1#lo, $a, $a, #2	@ A2 +	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B +	vext.8		$t3#lo, $b, $b, #2	@ B2 +	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2 +	vext.8		$t2#lo, $a, $a, #3	@ A3 +	veor		$t0, $t0, $r		@ L = E + F +	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B +	vext.8		$r#lo, $b, $b, #3	@ B3 +	veor		$t1, $t1, $t3		@ M = G + H +	vmull.p8	$r, $a, $r#lo		@ I = A*B3 +	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8 +	vand		$t0#hi, $t0#hi, $k48 +	vext.8		$t3#lo, $b, $b, #4	@ B4 +	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16 +	vand		$t1#hi, $t1#hi, $k32 +	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4 +	veor		$t2, $t2, $r		@ N = I + J +	veor		$t0#lo, $t0#lo, $t0#hi +	veor		$t1#lo, $t1#lo, $t1#hi +	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24 +	vand		$t2#hi, $t2#hi, $k16 +	vext.8		$t0, $t0, $t0, #15 +	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	$t3#hi, #0 +	vext.8		$t1, $t1, $t1, #14 +	veor		$t2#lo, $t2#lo, $t2#hi +	vmull.p8	$r, $a, $b		@ D = A*B +	vext.8		$t3, $t3, $t3, #12 +	vext.8		$t2, $t2, $t2, #13 +	veor		$t0, $t0, $t1 +	veor		$t2, $t2, $t3 +	veor		$r, $r, $t0 +	veor		$r, $r, $t2 +___ +}  $code.=<<___;  #if __ARM_ARCH__>=7  .fpu	neon +.global	gcm_init_neon +.type	gcm_init_neon,%function +.align	4 +gcm_init_neon: +	vld1.64		$IN#hi,[r1,:64]!	@ load H +	vmov.i8		$t0,#0xe1 +	vld1.64		$IN#lo,[r1,:64] +	vshl.i64	$t0#hi,#57 +	vshr.u64	$t0#lo,#63		@ t0=0xc2....01 +	vdup.8		$t1,$IN#hi[7] +	vshr.u64	$Hlo,$IN#lo,#63 +	vshr.s8		$t1,#7			@ broadcast carry bit +	vshl.i64	$IN,$IN,#1 +	vand		$t0,$t0,$t1 +	vorr		$IN#hi,$Hlo		@ H<<<=1 +	veor		$IN,$IN,$t0		@ twisted H +	vstmia		r0,{$IN} + +	ret					@ bx lr +.size	gcm_init_neon,.-gcm_init_neon +  .global	gcm_gmult_neon  .type	gcm_gmult_neon,%function  .align	4  gcm_gmult_neon: -	sub		$Htbl,#16		@ point at H in GCM128_CTX -	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial -	vld1.64		`&Dlo("$IN")`,[$Xi,:64]! -	vshr.u64	$mod,#32 -	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H -	veor		$zero,$zero +	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi +	vld1.64		$IN#lo,[$Xi,:64]! +	vmov.i64	$k48,#0x0000ffffffffffff +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H +	vmov.i64	$k32,#0x00000000ffffffff  #ifdef __ARMEL__  	vrev64.8	$IN,$IN  #endif -	veor		$Qpost,$Qpost -	veor		$R,$R -	mov		$cnt,#16 -	veor		$Z,$Z +	vmov.i64	$k16,#0x000000000000ffff +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing  	mov		$len,#16 -	veor		$Zo,$Zo -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -	b		.Linner_neon +	b		.Lgmult_neon  .size	gcm_gmult_neon,.-gcm_gmult_neon  .global	gcm_ghash_neon  .type	gcm_ghash_neon,%function  .align	4  gcm_ghash_neon: -	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial -	vld1.64		`&Dlo("$Z")`,[$Xi,:64]! -	vshr.u64	$mod,#32 -	vldmia		$Xi,{$Hhi-$Hlo}		@ load H -	veor		$zero,$zero -	nop +	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi +	vld1.64		$Xl#lo,[$Xi,:64]! 
+	vmov.i64	$k48,#0x0000ffffffffffff +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H +	vmov.i64	$k32,#0x00000000ffffffff  #ifdef __ARMEL__ -	vrev64.8	$Z,$Z +	vrev64.8	$Xl,$Xl  #endif -.Louter_neon: -	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp -	veor		$Qpost,$Qpost -	vld1.64		`&Dlo($IN)`,[$inp]! -	veor		$R,$R -	mov		$cnt,#16 +	vmov.i64	$k16,#0x000000000000ffff +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing + +.Loop_neon: +	vld1.64		$IN#hi,[$inp]!		@ load inp +	vld1.64		$IN#lo,[$inp]!  #ifdef __ARMEL__  	vrev64.8	$IN,$IN  #endif -	veor		$Zo,$Zo -	veor		$IN,$Z			@ inp^=Xi -	veor		$Z,$Z -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -.Linner_neon: -	subs		$cnt,$cnt,#1 -	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i] -	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i] -	vext.8		$IN,$zero,#1		@ IN>>=8 - -	veor		$Z,$Qpost		@ modulo-scheduled part -	vshl.i64	`&Dlo("$R")`,#48 -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")` - -	veor		`&Dhi("$Z")`,`&Dlo("$R")` -	vuzp.8		$Qlo,$Qhi -	vsli.8		$Zo,$T,#1		@ compose the "carry" byte -	vext.8		$Z,$zero,#1		@ Z>>=8 - -	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1 -	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit -	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8 -	veor		$Z,$Qhi -	bne		.Linner_neon - -	veor		$Z,$Qpost		@ modulo-scheduled artefact -	vshl.i64	`&Dlo("$R")`,#48 -	veor		`&Dhi("$Z")`,`&Dlo("$R")` - -	@ finalization, normalize Z:Zo -	vand		$Zo,$mod		@ suffices to mask the bit -	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 -	vshl.i64	$Z,#1 +	veor		$IN,$Xl			@ inp^=Xi +.Lgmult_neon: +___ +	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo +$code.=<<___; +	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing +___ +	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi) +	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi +$code.=<<___; +	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing +	veor		$Xm,$Xm,$Xh +	veor		$Xl#hi,$Xl#hi,$Xm#lo +	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result + +	@ equivalent of reduction_avx from ghash-x86_64.pl +	vshl.i64	$t1,$Xl,#57		@ 1st phase +	vshl.i64	$t2,$Xl,#62 +	veor		$t2,$t2,$t1		@ +	vshl.i64	$t1,$Xl,#63 +	veor		$t2, $t2, $t1		@ + 	veor		$Xl#hi,$Xl#hi,$t2#lo	@ +	veor		$Xh#lo,$Xh#lo,$t2#hi + +	vshr.u64	$t2,$Xl,#1		@ 2nd phase +	veor		$Xh,$Xh,$Xl +	veor		$Xl,$Xl,$t2		@ +	vshr.u64	$t2,$t2,#6 +	vshr.u64	$Xl,$Xl,#1		@ +	veor		$Xl,$Xl,$Xh		@ +	veor		$Xl,$Xl,$t2		@ +  	subs		$len,#16 -	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1 -	bne		.Louter_neon +	bne		.Loop_neon  #ifdef __ARMEL__ -	vrev64.8	$Z,$Z +	vrev64.8	$Xl,$Xl  #endif  	sub		$Xi,#16	 -	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi -	vst1.64		`&Dlo("$Z")`,[$Xi,:64] +	vst1.64		$Xl#hi,[$Xi,:64]!	
@ write out Xi +	vst1.64		$Xl#lo,[$Xi,:64] -	bx	lr +	ret					@ bx lr  .size	gcm_ghash_neon,.-gcm_ghash_neon  #endif  ___ @@ -423,7 +480,13 @@ $code.=<<___;  .align  2  ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { +	s/\`([^\`]*)\`/eval $1/geo; + +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or +	s/\bret\b/bx	lr/go		or +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4 + +	print $_,"\n"; +}  close STDOUT; # enforce flush diff --git a/main/openssl/crypto/modes/asm/ghashv8-armx-64.S b/main/openssl/crypto/modes/asm/ghashv8-armx-64.S new file mode 100644 index 00000000..b77b6c40 --- /dev/null +++ b/main/openssl/crypto/modes/asm/ghashv8-armx-64.S @@ -0,0 +1,115 @@ +#include "arm_arch.h" + +.text +.arch	armv8-a+crypto +.global	gcm_init_v8 +.type	gcm_init_v8,%function +.align	4 +gcm_init_v8: +	ld1		{v17.2d},[x1]		//load H +	movi		v16.16b,#0xe1 +	ext		v3.16b,v17.16b,v17.16b,#8 +	shl	v16.2d,v16.2d,#57 +	ushr	v18.2d,v16.2d,#63 +	ext		v16.16b,v18.16b,v16.16b,#8		//t0=0xc2....01 +	dup		v17.4s,v17.s[1] +	ushr	v19.2d,v3.2d,#63 +	sshr	v17.4s,v17.4s,#31		//broadcast carry bit +	and		v19.16b,v19.16b,v16.16b +	shl	v3.2d,v3.2d,#1 +	ext		v19.16b,v19.16b,v19.16b,#8 +	and		v16.16b,v16.16b,v17.16b +	orr		v3.16b,v3.16b,v19.16b		//H<<<=1 +	eor		v3.16b,v3.16b,v16.16b		//twisted H +	st1		{v3.2d},[x0] + +	ret +.size	gcm_init_v8,.-gcm_init_v8 + +.global	gcm_gmult_v8 +.type	gcm_gmult_v8,%function +.align	4 +gcm_gmult_v8: +	ld1		{v17.2d},[x0]		//load Xi +	movi		v19.16b,#0xe1 +	ld1		{v20.2d},[x1]		//load twisted H +	shl	v19.2d,v19.2d,#57 +#ifndef __ARMEB__ +	rev64	v17.16b,v17.16b +#endif +	ext		v21.16b,v20.16b,v20.16b,#8 +	mov		x3,#0 +	ext		v3.16b,v17.16b,v17.16b,#8 +	mov		x12,#0 +	eor		v21.16b,v21.16b,v20.16b		//Karatsuba pre-processing +	mov		x2,x0 +	b		.Lgmult_v8 +.size	gcm_gmult_v8,.-gcm_gmult_v8 + +.global	gcm_ghash_v8 +.type	gcm_ghash_v8,%function +.align	4 +gcm_ghash_v8: +	ld1		{v0.2d},[x0]		//load [rotated] Xi +	subs		x3,x3,#16 +	movi		v19.16b,#0xe1 +	mov		x12,#16 +	ld1		{v20.2d},[x1]		//load twisted H +	csel	x12,xzr,x12,eq +	ext		v0.16b,v0.16b,v0.16b,#8 +	shl	v19.2d,v19.2d,#57 +	ld1		{v17.2d},[x2],x12	//load [rotated] inp +	ext		v21.16b,v20.16b,v20.16b,#8 +#ifndef __ARMEB__ +	rev64	v0.16b,v0.16b +	rev64	v17.16b,v17.16b +#endif +	eor		v21.16b,v21.16b,v20.16b		//Karatsuba pre-processing +	ext		v3.16b,v17.16b,v17.16b,#8 +	b		.Loop_v8 + +.align	4 +.Loop_v8: +	ext		v18.16b,v0.16b,v0.16b,#8 +	eor		v3.16b,v3.16b,v0.16b		//inp^=Xi +	eor		v17.16b,v17.16b,v18.16b		//v17.16b is rotated inp^Xi + +.Lgmult_v8: +	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo +	eor		v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing +	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi +	subs		x3,x3,#16 +	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi) +	csel	x12,xzr,x12,eq + +	ext		v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing +	eor		v18.16b,v0.16b,v2.16b +	eor		v1.16b,v1.16b,v17.16b +	 ld1	{v17.2d},[x2],x12	//load [rotated] inp +	eor		v1.16b,v1.16b,v18.16b +	pmull	v18.1q,v0.1d,v19.1d		//1st phase + +	ins	v2.d[0],v1.d[1] +	ins	v1.d[1],v0.d[0] +#ifndef __ARMEB__ +	 rev64	v17.16b,v17.16b +#endif +	eor		v0.16b,v1.16b,v18.16b +	 ext		v3.16b,v17.16b,v17.16b,#8 + +	ext		v18.16b,v0.16b,v0.16b,#8		//2nd phase +	pmull	v0.1q,v0.1d,v19.1d +	eor		v18.16b,v18.16b,v2.16b +	eor		v0.16b,v0.16b,v18.16b +	b.hs		.Loop_v8 + +#ifndef __ARMEB__ +	rev64	v0.16b,v0.16b +#endif +	ext		
v0.16b,v0.16b,v0.16b,#8 +	st1		{v0.2d},[x0]		//write out Xi + +	ret +.size	gcm_ghash_v8,.-gcm_ghash_v8 +.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align  2 diff --git a/main/openssl/crypto/modes/asm/ghashv8-armx.S b/main/openssl/crypto/modes/asm/ghashv8-armx.S new file mode 100644 index 00000000..f388c54e --- /dev/null +++ b/main/openssl/crypto/modes/asm/ghashv8-armx.S @@ -0,0 +1,116 @@ +#include "arm_arch.h" + +.text +.fpu	neon +.code	32 +.global	gcm_init_v8 +.type	gcm_init_v8,%function +.align	4 +gcm_init_v8: +	vld1.64		{q9},[r1]		@ load H +	vmov.i8		q8,#0xe1 +	vext.8		q3,q9,q9,#8 +	vshl.i64	q8,q8,#57 +	vshr.u64	q10,q8,#63 +	vext.8		q8,q10,q8,#8		@ t0=0xc2....01 +	vdup.32	q9,d18[1] +	vshr.u64	q11,q3,#63 +	vshr.s32	q9,q9,#31		@ broadcast carry bit +	vand		q11,q11,q8 +	vshl.i64	q3,q3,#1 +	vext.8		q11,q11,q11,#8 +	vand		q8,q8,q9 +	vorr		q3,q3,q11		@ H<<<=1 +	veor		q3,q3,q8		@ twisted H +	vst1.64		{q3},[r0] + +	bx	lr +.size	gcm_init_v8,.-gcm_init_v8 + +.global	gcm_gmult_v8 +.type	gcm_gmult_v8,%function +.align	4 +gcm_gmult_v8: +	vld1.64		{q9},[r0]		@ load Xi +	vmov.i8		q11,#0xe1 +	vld1.64		{q12},[r1]		@ load twisted H +	vshl.u64	q11,q11,#57 +#ifndef __ARMEB__ +	vrev64.8	q9,q9 +#endif +	vext.8		q13,q12,q12,#8 +	mov		r3,#0 +	vext.8		q3,q9,q9,#8 +	mov		r12,#0 +	veor		q13,q13,q12		@ Karatsuba pre-processing +	mov		r2,r0 +	b		.Lgmult_v8 +.size	gcm_gmult_v8,.-gcm_gmult_v8 + +.global	gcm_ghash_v8 +.type	gcm_ghash_v8,%function +.align	4 +gcm_ghash_v8: +	vld1.64		{q0},[r0]		@ load [rotated] Xi +	subs		r3,r3,#16 +	vmov.i8		q11,#0xe1 +	mov		r12,#16 +	vld1.64		{q12},[r1]		@ load twisted H +	moveq	r12,#0 +	vext.8		q0,q0,q0,#8 +	vshl.u64	q11,q11,#57 +	vld1.64		{q9},[r2],r12	@ load [rotated] inp +	vext.8		q13,q12,q12,#8 +#ifndef __ARMEB__ +	vrev64.8	q0,q0 +	vrev64.8	q9,q9 +#endif +	veor		q13,q13,q12		@ Karatsuba pre-processing +	vext.8		q3,q9,q9,#8 +	b		.Loop_v8 + +.align	4 +.Loop_v8: +	vext.8		q10,q0,q0,#8 +	veor		q3,q3,q0		@ inp^=Xi +	veor		q9,q9,q10		@ q9 is rotated inp^Xi + +.Lgmult_v8: +	.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo +	veor		q9,q9,q3		@ Karatsuba pre-processing +	.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi +	subs		r3,r3,#16 +	.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi) +	moveq	r12,#0 + +	vext.8		q9,q0,q2,#8		@ Karatsuba post-processing +	veor		q10,q0,q2 +	veor		q1,q1,q9 +	 vld1.64	{q9},[r2],r12	@ load [rotated] inp +	veor		q1,q1,q10 +	.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase + +	vmov		d4,d3		@ Xh|Xm - 256-bit result +	vmov		d3,d0		@ Xm is rotated Xl +#ifndef __ARMEB__ +	 vrev64.8	q9,q9 +#endif +	veor		q0,q1,q10 +	 vext.8		q3,q9,q9,#8 + +	vext.8		q10,q0,q0,#8		@ 2nd phase +	.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11 +	veor		q10,q10,q2 +	veor		q0,q0,q10 +	bhs		.Loop_v8 + +#ifndef __ARMEB__ +	vrev64.8	q0,q0 +#endif +	vext.8		q0,q0,q0,#8 +	vst1.64		{q0},[r0]		@ write out Xi + +	bx	lr +.size	gcm_ghash_v8,.-gcm_ghash_v8 +.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align  2 diff --git a/main/openssl/crypto/modes/asm/ghashv8-armx.pl b/main/openssl/crypto/modes/asm/ghashv8-armx.pl new file mode 100644 index 00000000..69e863e7 --- /dev/null +++ b/main/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,240 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# Current performance in cycles per processed byte: +# +#		PMULL[2]	32-bit NEON(*) +# Apple A7	1.76		5.62 +# Cortex-A5x	n/a		n/a +# +# (*)	presented for reference/comparison purposes; + +$flavour = shift; +open STDOUT,">".shift; + +$Xi="x0";	# argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/); +$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/); + +$code.=<<___; +.global	gcm_init_v8 +.type	gcm_init_v8,%function +.align	4 +gcm_init_v8: +	vld1.64		{$t1},[x1]		@ load H +	vmov.i8		$t0,#0xe1 +	vext.8		$IN,$t1,$t1,#8 +	vshl.i64	$t0,$t0,#57 +	vshr.u64	$t2,$t0,#63 +	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01 +	vdup.32		$t1,${t1}[1] +	vshr.u64	$t3,$IN,#63 +	vshr.s32	$t1,$t1,#31		@ broadcast carry bit +	vand		$t3,$t3,$t0 +	vshl.i64	$IN,$IN,#1 +	vext.8		$t3,$t3,$t3,#8 +	vand		$t0,$t0,$t1 +	vorr		$IN,$IN,$t3		@ H<<<=1 +	veor		$IN,$IN,$t0		@ twisted H +	vst1.64		{$IN},[x0] + +	ret +.size	gcm_init_v8,.-gcm_init_v8 + +.global	gcm_gmult_v8 +.type	gcm_gmult_v8,%function +.align	4 +gcm_gmult_v8: +	vld1.64		{$t1},[$Xi]		@ load Xi +	vmov.i8		$t3,#0xe1 +	vld1.64		{$H},[$Htbl]		@ load twisted H +	vshl.u64	$t3,$t3,#57 +#ifndef __ARMEB__ +	vrev64.8	$t1,$t1 +#endif +	vext.8		$Hhl,$H,$H,#8 +	mov		$len,#0 +	vext.8		$IN,$t1,$t1,#8 +	mov		$inc,#0 +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing +	mov		$inp,$Xi +	b		.Lgmult_v8 +.size	gcm_gmult_v8,.-gcm_gmult_v8 + +.global	gcm_ghash_v8 +.type	gcm_ghash_v8,%function +.align	4 +gcm_ghash_v8: +	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi +	subs		$len,$len,#16 +	vmov.i8		$t3,#0xe1 +	mov		$inc,#16 +	vld1.64		{$H},[$Htbl]		@ load twisted H +	cclr		$inc,eq +	vext.8		$Xl,$Xl,$Xl,#8 +	vshl.u64	$t3,$t3,#57 +	vld1.64		{$t1},[$inp],$inc	@ load [rotated] inp +	vext.8		$Hhl,$H,$H,#8 +#ifndef __ARMEB__ +	vrev64.8	$Xl,$Xl +	vrev64.8	$t1,$t1 +#endif +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing +	vext.8		$IN,$t1,$t1,#8 +	b		.Loop_v8 + +.align	4 +.Loop_v8: +	vext.8		$t2,$Xl,$Xl,#8 +	veor		$IN,$IN,$Xl		@ inp^=Xi +	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi + +.Lgmult_v8: +	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo +	veor		$t1,$t1,$IN		@ Karatsuba pre-processing +	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi +	subs		$len,$len,#16 +	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi) +	cclr		$inc,eq + +	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing +	veor		$t2,$Xl,$Xh +	veor		$Xm,$Xm,$t1 +	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp +	veor		$Xm,$Xm,$t2 +	vpmull.p64	$t2,$Xl,$t3		@ 1st phase + +	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result +	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl +#ifndef __ARMEB__ +	 vrev64.8	$t1,$t1 +#endif +	veor		$Xl,$Xm,$t2 +	 vext.8		$IN,$t1,$t1,#8 + +	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase +	vpmull.p64	$Xl,$Xl,$t3 +	veor		$t2,$t2,$Xh +	veor		$Xl,$Xl,$t2 +	b.hs		.Loop_v8 
+ +#ifndef __ARMEB__ +	vrev64.8	$Xl,$Xl +#endif +	vext.8		$Xl,$Xl,$Xl,#8 +	vst1.64		{$Xl},[$Xi]		@ write out Xi + +	ret +.size	gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align  2 +___ + +if ($flavour =~ /64/) {			######## 64-bit code +    sub unvmov { +	my $arg=shift; + +	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && +	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; +    } +    foreach(split("\n",$code)) { +	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or +	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics +	s/vmov\s+(.*)/unvmov($1)/geo	or +	s/vext\.8/ext/o			or +	s/vshr\.s/sshr\.s/o		or +	s/vshr/ushr/o			or +	s/^(\s+)v/$1/o			or	# strip off v prefix +	s/\bbx\s+lr\b/ret/o; + +	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers +	s/@\s/\/\//o;				# old->new style commentary + +	# fix up remainig legacy suffixes +	s/\.[ui]?8(\s)/$1/o; +	s/\.[uis]?32//o and s/\.16b/\.4s/go; +	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument +	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments +	s/\.[uisp]?64//o and s/\.16b/\.2d/go; +	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + +	print $_,"\n"; +    } +} else {				######## 32-bit code +    sub unvdup32 { +	my $arg=shift; + +	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && +	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; +    } +    sub unvpmullp64 { +	my ($mnemonic,$arg)=@_; + +	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { +	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) +				 |(($2&7)<<17)|(($2&8)<<4) +				 |(($3&7)<<1) |(($3&8)<<2); +	    $word |= 0x00010001	 if ($mnemonic =~ "2"); +	    # since ARMv7 instructions are always encoded little-endian. +	    # correct solution is to use .inst directive, but older +	    # assemblers don't implement it:-( +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", +			$word&0xff,($word>>8)&0xff, +			($word>>16)&0xff,($word>>24)&0xff, +			$mnemonic,$arg; +	} +    } + +    foreach(split("\n",$code)) { +	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers +	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers +        s/\/\/\s?/@ /o;				# new->old style commentary + +	# fix up remainig new-style suffixes +	s/\],#[0-9]+/]!/o; + +	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or +	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or +	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or +	s/^(\s+)b\./$1b/o						or +	s/^(\s+)ret/$1bx\tlr/o; + +        print $_,"\n"; +    } +} + +close STDOUT; # enforce flush diff --git a/main/openssl/crypto/modes/gcm128.c b/main/openssl/crypto/modes/gcm128.c index e1dc2b0f..79ebb66e 100644 --- a/main/openssl/crypto/modes/gcm128.c +++ b/main/openssl/crypto/modes/gcm128.c @@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])  #endif -#if	TABLE_BITS==4 && defined(GHASH_ASM) +#if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))  # if	!defined(I386_ONLY) && \  	(defined(__i386)	|| defined(__i386__)	|| \  	 defined(__x86_64)	|| defined(__x86_64__)	|| \ @@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len  void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);  void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);  #  endif -# elif defined(__arm__) || defined(__arm) +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)  #  include 
"arm_arch.h"  #  if __ARM_ARCH__>=7  #   define GHASH_ASM_ARM  #   define GCM_FUNCREF_4BIT +#   define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL) +#   if defined(__arm__) || defined(__arm) +#    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON) +#   endif +void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);  void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);  void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_init_v8(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);  #  endif  # endif  #endif @@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)  	ctx->ghash = gcm_ghash_4bit;  #  endif  # elif	defined(GHASH_ASM_ARM) -	if (OPENSSL_armcap_P & ARMV7_NEON) { +#  ifdef PMULL_CAPABLE +	if (PMULL_CAPABLE) { +		gcm_init_v8(ctx->Htable,ctx->H.u); +		ctx->gmult = gcm_gmult_v8; +		ctx->ghash = gcm_ghash_v8; +	} else +#  endif +#  ifdef NEON_CAPABLE +	if (NEON_CAPABLE) { +		gcm_init_neon(ctx->Htable,ctx->H.u);  		ctx->gmult = gcm_gmult_neon;  		ctx->ghash = gcm_ghash_neon; -	} else { +	} else +#  endif +	{  		gcm_init_4bit(ctx->Htable,ctx->H.u);  		ctx->gmult = gcm_gmult_4bit;  		ctx->ghash = gcm_ghash_4bit; diff --git a/main/openssl/crypto/opensslconf-32.h b/main/openssl/crypto/opensslconf-32.h index d6625489..caf6f1b8 100644 --- a/main/openssl/crypto/opensslconf-32.h +++ b/main/openssl/crypto/opensslconf-32.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/crypto/opensslconf-64.h b/main/openssl/crypto/opensslconf-64.h index 70c5a2cb..88fb0419 100644 --- a/main/openssl/crypto/opensslconf-64.h +++ b/main/openssl/crypto/opensslconf-64.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/crypto/opensslconf-static-32.h b/main/openssl/crypto/opensslconf-static-32.h index d6625489..caf6f1b8 100644 --- a/main/openssl/crypto/opensslconf-static-32.h +++ b/main/openssl/crypto/opensslconf-static-32.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/crypto/opensslconf-static-64.h b/main/openssl/crypto/opensslconf-static-64.h index 70c5a2cb..88fb0419 100644 --- 
a/main/openssl/crypto/opensslconf-static-64.h +++ b/main/openssl/crypto/opensslconf-static-64.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/crypto/ripemd/README b/main/openssl/crypto/ripemd/README deleted file mode 100644 index f1ffc8b1..00000000 --- a/main/openssl/crypto/ripemd/README +++ /dev/null @@ -1,15 +0,0 @@ -RIPEMD-160 -http://www.esat.kuleuven.ac.be/~bosselae/ripemd160.html - -This is my implementation of RIPEMD-160.  The pentium assember is a little -off the pace since I only get 1050 cycles, while the best is 1013. -I have a few ideas for how to get another 20 or so cycles, but at -this point I will not bother right now.  I believe the trick will be -to remove my 'copy X array onto stack' until inside the RIP1() finctions the -first time round.  To do this I need another register and will only have one -temporary one.  A bit tricky....  I can also cleanup the saving of the 5 words -after the first half of the calculation.  I should read the origional -value, add then write.  Currently I just save the new and read the origioal. -I then read both at the end.  Bad. - -eric (20-Jan-1998) diff --git a/main/openssl/crypto/ripemd/asm/rips.cpp b/main/openssl/crypto/ripemd/asm/rips.cpp deleted file mode 100644 index f7a13677..00000000 --- a/main/openssl/crypto/ripemd/asm/rips.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// -// gettsc.inl -// -// gives access to the Pentium's (secret) cycle counter -// -// This software was written by Leonard Janke (janke@unixg.ubc.ca) -// in 1996-7 and is entered, by him, into the public domain. 
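[Editor's note -- not part of the patch] Together with deleting crypto/ripemd above, the opensslconf-*.h hunks define OPENSSL_NO_RIPEMD, so any code in this tree that still references RIPEMD-160 has to be fenced off at compile time. A minimal sketch of the usual guard, assuming the standard <openssl/ripemd.h> one-shot API (digest_rmd160() is an illustrative name, not from the patch):

/* Sketch only: compile RIPEMD-160 users conditionally now that
 * OPENSSL_NO_RIPEMD is defined in this configuration. */
#include <stddef.h>
#include <openssl/opensslconf.h>
#ifndef OPENSSL_NO_RIPEMD
# include <openssl/ripemd.h>
static int digest_rmd160(const unsigned char *in, size_t len,
                         unsigned char out[RIPEMD160_DIGEST_LENGTH])
{
    return RIPEMD160(in, len, out) != NULL;   /* one-shot RIPEMD-160 digest */
}
#endif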
- -#if defined(__WATCOMC__) -void GetTSC(unsigned long&); -#pragma aux GetTSC = 0x0f 0x31 "mov [edi], eax" parm [edi] modify [edx eax]; -#elif defined(__GNUC__) -inline -void GetTSC(unsigned long& tsc) -{ -  asm volatile(".byte 15, 49\n\t" -	       : "=eax" (tsc) -	       : -	       : "%edx", "%eax"); -} -#elif defined(_MSC_VER) -inline -void GetTSC(unsigned long& tsc) -{ -  unsigned long a; -  __asm _emit 0fh -  __asm _emit 31h -  __asm mov a, eax; -  tsc=a; -} -#endif       - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/ripemd.h> - -#define ripemd160_block_x86 ripemd160_block_asm_host_order - -extern "C" { -void ripemd160_block_x86(RIPEMD160_CTX *ctx, unsigned char *buffer,int num); -} - -void main(int argc,char *argv[]) -	{ -	unsigned char buffer[64*256]; -	RIPEMD160_CTX ctx; -	unsigned long s1,s2,e1,e2; -	unsigned char k[16]; -	unsigned long data[2]; -	unsigned char iv[8]; -	int i,num=0,numm; -	int j=0; - -	if (argc >= 2) -		num=atoi(argv[1]); - -	if (num == 0) num=16; -	if (num > 250) num=16; -	numm=num+2; -#if 0 -	num*=64; -	numm*=64; -#endif - -	for (j=0; j<6; j++) -		{ -		for (i=0; i<10; i++) /**/ -			{ -			ripemd160_block_x86(&ctx,buffer,numm); -			GetTSC(s1); -			ripemd160_block_x86(&ctx,buffer,numm); -			GetTSC(e1); -			GetTSC(s2); -			ripemd160_block_x86(&ctx,buffer,num); -			GetTSC(e2); -			ripemd160_block_x86(&ctx,buffer,num); -			} -		printf("ripemd160 (%d bytes) %d %d (%.2f)\n",num*64, -			e1-s1,e2-s2,(double)((e1-s1)-(e2-s2))/2); -		} -	} - diff --git a/main/openssl/crypto/ripemd/asm/rmd-586.pl b/main/openssl/crypto/ripemd/asm/rmd-586.pl deleted file mode 100644 index e8b2bc2d..00000000 --- a/main/openssl/crypto/ripemd/asm/rmd-586.pl +++ /dev/null @@ -1,591 +0,0 @@ -#!/usr/local/bin/perl - -# Normal is the -# ripemd160_block_asm_data_order(RIPEMD160_CTX *c, ULONG *X,int blocks); - -$normal=0; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -$A="ecx"; -$B="esi"; -$C="edi"; -$D="ebx"; -$E="ebp"; -$tmp1="eax"; -$tmp2="edx"; - -$KL1=0x5A827999; -$KL2=0x6ED9EBA1; -$KL3=0x8F1BBCDC; -$KL4=0xA953FD4E; -$KR0=0x50A28BE6; -$KR1=0x5C4DD124;  -$KR2=0x6D703EF3; -$KR3=0x7A6D76E9; - - -@wl=(	 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, -	 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8, -	 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12, -	 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2, -	 4, 0, 5, 9, 7,12, 2,10,14, 1, 3, 8,11, 6,15,13, -	 ); - -@wr=(	 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12, -	 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2, -	15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13, -	 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14, -	12,15,10, 4, 1, 5, 8, 7, 6, 2,13,14, 0, 3, 9,11, -	); - -@sl=(	11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8, -	 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12, -	11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5, -	11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12, -	 9,15, 5,11, 6, 8,13,12, 5,12,13,14,11, 8, 5, 6, -	 ); - -@sr=(	 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6, -	 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11, -	 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5, -	15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8, -	 8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11, - 	); - -&ripemd160_block("ripemd160_block_asm_data_order"); -&asm_finish(); - -sub Xv -	{ -	local($n)=@_; -	return(&swtmp($n)); -	# tmp on stack -	} - -sub Np -	{ -	local($p)=@_; -	local(%n)=($A,$E,$B,$A,$C,$B,$D,$C,$E,$D); -	return($n{$p}); -	} - -sub RIP1 -	{ -	
local($a,$b,$c,$d,$e,$pos,$s,$o,$pos2)=@_; - -	&comment($p++); -	if ($p & 1) -		{ -	 #&mov($tmp1,	$c) if $o == -1; -	&xor($tmp1,	$d) if $o == -1; -	 &mov($tmp2,	&Xv($pos)); -	&xor($tmp1,	$b); -	 &add($a,	$tmp2); -	&rotl($c,	10); -	&add($a,	$tmp1); -	 &mov($tmp1,	&Np($c));	# NEXT -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	else -		{ -	 &xor($tmp1,	$d); -	&mov($tmp2,	&Xv($pos)); -	 &xor($tmp1,	$b); -	&add($a,	$tmp1); -	 &mov($tmp1,	&Np($c)) if $o <= 0; -	 &mov($tmp1,	-1) if $o == 1; -	 # XXX if $o == 2; -	&rotl($c,	10); -	&add($a,	$tmp2); -	 &xor($tmp1,	&Np($d)) if $o <= 0; -	 &mov($tmp2,	&Xv($pos2)) if $o == 1; -	 &mov($tmp2,	&wparam(0)) if $o == 2; -	&rotl($a,	$s); -	&add($a,	$e); -		} -	} - -sub RIP2 -	{ -	local($a,$b,$c,$d,$e,$pos,$pos2,$s,$K,$o)=@_; - -# XXXXXX -	&comment($p++); -	if ($p & 1) -		{ -#	 &mov($tmp2,	&Xv($pos)) if $o < -1; -#	&mov($tmp1,	-1) if $o < -1; - -	 &add($a,	$tmp2); -	&mov($tmp2,	$c); -	 &sub($tmp1,	$b); -	&and($tmp2,	$b); -	 &and($tmp1,	$d); -	&or($tmp2,	$tmp1); -	 &mov($tmp1,	&Xv($pos2)) if $o <= 0; # XXXXXXXXXXXXXX -	 # XXX -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2,1)); -	 &mov($tmp2,	-1) if $o <= 0; -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	else -		{ -	 # XXX -	 &add($a,	$tmp1); -	&mov($tmp1,	$c); -	 &sub($tmp2,	$b); -	&and($tmp1,	$b); -	 &and($tmp2,	$d); -	if ($o != 2) -		{ -	&or($tmp1,	$tmp2); -	 &mov($tmp2,	&Xv($pos2)) if $o <= 0; -	 &mov($tmp2,	-1) if $o == 1; -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp1,1)); -	 &mov($tmp1,	-1) if $o <= 0; -	 &sub($tmp2,	&Np($c)) if $o == 1; -		} else { -	&or($tmp2,	$tmp1); -	 &mov($tmp1,	&Np($c)); -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2,1)); -	 &xor($tmp1,	&Np($d)); -		} -	&rotl($a,	$s); -	&add($a,	$e); -		} -	} - -sub RIP3 -	{ -	local($a,$b,$c,$d,$e,$pos,$s,$K,$o,$pos2)=@_; - -	&comment($p++); -	if ($p & 1) -		{ -#	 &mov($tmp2,	-1) if $o < -1; -#	&sub($tmp2,	$c) if $o < -1; -	 &mov($tmp1,	&Xv($pos)); -	&or($tmp2,	$b); -	 &add($a,	$tmp1); -	&xor($tmp2,	$d); -	 &mov($tmp1,	-1) if $o <= 0;		# NEXT -	 # XXX -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2,1)); -	 &sub($tmp1,	&Np($c)) if $o <= 0;	# NEXT -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	else -		{ -	 &mov($tmp2,	&Xv($pos)); -	&or($tmp1,	$b); -	 &add($a,	$tmp2); -	&xor($tmp1,	$d); -	 &mov($tmp2,	-1) if $o <= 0;		# NEXT -	 &mov($tmp2,	-1) if $o == 1; -	 &mov($tmp2,	&Xv($pos2)) if $o == 2; -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp1,1)); -	 &sub($tmp2,	&Np($c)) if $o <= 0;	# NEXT -	 &mov($tmp1,	&Np($d)) if $o == 1; -	 &mov($tmp1,	-1) if $o == 2; -	&rotl($a,	$s); -	&add($a,	$e); -		} -	} - -sub RIP4 -	{ -	local($a,$b,$c,$d,$e,$pos,$s,$K,$o)=@_; - -	&comment($p++); -	if ($p & 1) -		{ -#	 &mov($tmp2,	-1) if $o == -2; -#	&mov($tmp1,	$d) if $o == -2; -	 &sub($tmp2,	$d); -	&and($tmp1,	$b); -	 &and($tmp2,	$c); -	&or($tmp2,	$tmp1); -	 &mov($tmp1,	&Xv($pos)); -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2)); -	 &mov($tmp2,	-1) unless $o > 0;	# NEXT -	 # XXX -	&add($a,	$tmp1); -	 &mov($tmp1,	&Np($d)) unless $o > 0; # NEXT -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	else -		{ -	 &sub($tmp2,	$d); -	&and($tmp1,	$b); -	 &and($tmp2,	$c); -	&or($tmp2,	$tmp1); -	 &mov($tmp1,	&Xv($pos)); -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2)); -	 &mov($tmp2,	-1) if $o == 0;	# NEXT -	 &mov($tmp2,	-1) if $o == 1; -	 &mov($tmp2,	-1) if $o == 2; -	 # XXX -	&add($a,	$tmp1); -	 &mov($tmp1,	&Np($d)) if $o == 0;	# NEXT -	 &sub($tmp2,	&Np($d)) if $o == 1; -	 &sub($tmp2,	&Np($c)) if $o == 2; -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	} - -sub RIP5 -	{ -	
local($a,$b,$c,$d,$e,$pos,$s,$K,$o)=@_; - -	&comment($p++); -	if ($p & 1) -		{ -	 &mov($tmp2,	-1) if $o == -2; -	&sub($tmp2,	$d) if $o == -2; -	 &mov($tmp1,	&Xv($pos)); -	&or($tmp2,	$c); -	 &add($a,	$tmp1); -	&xor($tmp2,	$b); -	 &mov($tmp1,	-1) if $o <= 0; -	 # XXX -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp2,1)); -	 &sub($tmp1,	&Np($d)) if $o <= 0; -	 # XXX -	&rotl($a,	$s); -	&add($a,	$e); -		} -	else -		{ -	 &mov($tmp2,	&Xv($pos)); -	&or($tmp1,	$c); -	 &add($a,	$tmp2); -	&xor($tmp1,	$b); -	 &mov($tmp2,	-1) if $o <= 0; -	 &mov($tmp2,	&wparam(0)) if $o == 1;	# Middle code -	 &mov($tmp2,	-1) if $o == 2; -	&rotl($c,	10); -	&lea($a,	&DWP($K,$a,$tmp1,1)); -	 &sub($tmp2,	&Np($d)) if $o <= 0; -	 &mov(&swtmp(16),	$A) if $o == 1; -	 &mov($tmp1,	&Np($d)) if $o == 2; -	&rotl($a,	$s); -	&add($a,	$e); -		} -	} - -sub ripemd160_block -	{ -	local($name)=@_; - -	&function_begin_B($name,"",3); - -	# parameter 1 is the RIPEMD160_CTX structure. -	# A	0 -	# B	4 -	# C	8 -	# D 	12 -	# E 	16 - -	&mov($tmp2,	&wparam(0)); -	 &mov($tmp1,	&wparam(1)); -	&push("esi"); -	 &mov($A,	&DWP( 0,$tmp2,"",0)); -	&push("edi"); -	 &mov($B,	&DWP( 4,$tmp2,"",0)); -	&push("ebp"); -	 &mov($C,	&DWP( 8,$tmp2,"",0)); -	&push("ebx"); -	 &stack_push(16+5+6); -			  # Special comment about the figure of 6. -			  # Idea is to pad the current frame so -			  # that the top of the stack gets fairly -			  # aligned. Well, as you realize it would -			  # always depend on how the frame below is -			  # aligned. The good news are that gcc-2.95 -			  # and later does keep first argument at -			  # least double-wise aligned. -			  #			<appro@fy.chalmers.se> - -	&set_label("start") unless $normal; -	&comment(""); - -	# &mov($tmp1,	&wparam(1)); # Done at end of loop -	# &mov($tmp2,	&wparam(0)); # Done at end of loop - -	for ($z=0; $z<16; $z+=2) -		{ -		&mov($D,		&DWP( $z*4,$tmp1,"",0)); -		 &mov($E,		&DWP( ($z+1)*4,$tmp1,"",0)); -		&mov(&swtmp($z),	$D); -		 &mov(&swtmp($z+1),	$E); -		} -	&mov($tmp1,	$C); -	 &mov($D,	&DWP(12,$tmp2,"",0)); -	&mov($E,	&DWP(16,$tmp2,"",0)); - -	&RIP1($A,$B,$C,$D,$E,$wl[ 0],$sl[ 0],-1); -	&RIP1($E,$A,$B,$C,$D,$wl[ 1],$sl[ 1],0); -	&RIP1($D,$E,$A,$B,$C,$wl[ 2],$sl[ 2],0); -	&RIP1($C,$D,$E,$A,$B,$wl[ 3],$sl[ 3],0); -	&RIP1($B,$C,$D,$E,$A,$wl[ 4],$sl[ 4],0); -	&RIP1($A,$B,$C,$D,$E,$wl[ 5],$sl[ 5],0); -	&RIP1($E,$A,$B,$C,$D,$wl[ 6],$sl[ 6],0); -	&RIP1($D,$E,$A,$B,$C,$wl[ 7],$sl[ 7],0); -	&RIP1($C,$D,$E,$A,$B,$wl[ 8],$sl[ 8],0); -	&RIP1($B,$C,$D,$E,$A,$wl[ 9],$sl[ 9],0); -	&RIP1($A,$B,$C,$D,$E,$wl[10],$sl[10],0); -	&RIP1($E,$A,$B,$C,$D,$wl[11],$sl[11],0); -	&RIP1($D,$E,$A,$B,$C,$wl[12],$sl[12],0); -	&RIP1($C,$D,$E,$A,$B,$wl[13],$sl[13],0); -	&RIP1($B,$C,$D,$E,$A,$wl[14],$sl[14],0); -	&RIP1($A,$B,$C,$D,$E,$wl[15],$sl[15],1,$wl[16]); - -	&RIP2($E,$A,$B,$C,$D,$wl[16],$wl[17],$sl[16],$KL1,-1); -	&RIP2($D,$E,$A,$B,$C,$wl[17],$wl[18],$sl[17],$KL1,0); -	&RIP2($C,$D,$E,$A,$B,$wl[18],$wl[19],$sl[18],$KL1,0); -	&RIP2($B,$C,$D,$E,$A,$wl[19],$wl[20],$sl[19],$KL1,0); -	&RIP2($A,$B,$C,$D,$E,$wl[20],$wl[21],$sl[20],$KL1,0); -	&RIP2($E,$A,$B,$C,$D,$wl[21],$wl[22],$sl[21],$KL1,0); -	&RIP2($D,$E,$A,$B,$C,$wl[22],$wl[23],$sl[22],$KL1,0); -	&RIP2($C,$D,$E,$A,$B,$wl[23],$wl[24],$sl[23],$KL1,0); -	&RIP2($B,$C,$D,$E,$A,$wl[24],$wl[25],$sl[24],$KL1,0); -	&RIP2($A,$B,$C,$D,$E,$wl[25],$wl[26],$sl[25],$KL1,0); -	&RIP2($E,$A,$B,$C,$D,$wl[26],$wl[27],$sl[26],$KL1,0); -	&RIP2($D,$E,$A,$B,$C,$wl[27],$wl[28],$sl[27],$KL1,0); -	&RIP2($C,$D,$E,$A,$B,$wl[28],$wl[29],$sl[28],$KL1,0); -	&RIP2($B,$C,$D,$E,$A,$wl[29],$wl[30],$sl[29],$KL1,0); -	
&RIP2($A,$B,$C,$D,$E,$wl[30],$wl[31],$sl[30],$KL1,0); -	&RIP2($E,$A,$B,$C,$D,$wl[31],$wl[32],$sl[31],$KL1,1); - -	&RIP3($D,$E,$A,$B,$C,$wl[32],$sl[32],$KL2,-1); -	&RIP3($C,$D,$E,$A,$B,$wl[33],$sl[33],$KL2,0); -	&RIP3($B,$C,$D,$E,$A,$wl[34],$sl[34],$KL2,0); -	&RIP3($A,$B,$C,$D,$E,$wl[35],$sl[35],$KL2,0); -	&RIP3($E,$A,$B,$C,$D,$wl[36],$sl[36],$KL2,0); -	&RIP3($D,$E,$A,$B,$C,$wl[37],$sl[37],$KL2,0); -	&RIP3($C,$D,$E,$A,$B,$wl[38],$sl[38],$KL2,0); -	&RIP3($B,$C,$D,$E,$A,$wl[39],$sl[39],$KL2,0); -	&RIP3($A,$B,$C,$D,$E,$wl[40],$sl[40],$KL2,0); -	&RIP3($E,$A,$B,$C,$D,$wl[41],$sl[41],$KL2,0); -	&RIP3($D,$E,$A,$B,$C,$wl[42],$sl[42],$KL2,0); -	&RIP3($C,$D,$E,$A,$B,$wl[43],$sl[43],$KL2,0); -	&RIP3($B,$C,$D,$E,$A,$wl[44],$sl[44],$KL2,0); -	&RIP3($A,$B,$C,$D,$E,$wl[45],$sl[45],$KL2,0); -	&RIP3($E,$A,$B,$C,$D,$wl[46],$sl[46],$KL2,0); -	&RIP3($D,$E,$A,$B,$C,$wl[47],$sl[47],$KL2,1); - -	&RIP4($C,$D,$E,$A,$B,$wl[48],$sl[48],$KL3,-1); -	&RIP4($B,$C,$D,$E,$A,$wl[49],$sl[49],$KL3,0); -	&RIP4($A,$B,$C,$D,$E,$wl[50],$sl[50],$KL3,0); -	&RIP4($E,$A,$B,$C,$D,$wl[51],$sl[51],$KL3,0); -	&RIP4($D,$E,$A,$B,$C,$wl[52],$sl[52],$KL3,0); -	&RIP4($C,$D,$E,$A,$B,$wl[53],$sl[53],$KL3,0); -	&RIP4($B,$C,$D,$E,$A,$wl[54],$sl[54],$KL3,0); -	&RIP4($A,$B,$C,$D,$E,$wl[55],$sl[55],$KL3,0); -	&RIP4($E,$A,$B,$C,$D,$wl[56],$sl[56],$KL3,0); -	&RIP4($D,$E,$A,$B,$C,$wl[57],$sl[57],$KL3,0); -	&RIP4($C,$D,$E,$A,$B,$wl[58],$sl[58],$KL3,0); -	&RIP4($B,$C,$D,$E,$A,$wl[59],$sl[59],$KL3,0); -	&RIP4($A,$B,$C,$D,$E,$wl[60],$sl[60],$KL3,0); -	&RIP4($E,$A,$B,$C,$D,$wl[61],$sl[61],$KL3,0); -	&RIP4($D,$E,$A,$B,$C,$wl[62],$sl[62],$KL3,0); -	&RIP4($C,$D,$E,$A,$B,$wl[63],$sl[63],$KL3,1); - -	&RIP5($B,$C,$D,$E,$A,$wl[64],$sl[64],$KL4,-1); -	&RIP5($A,$B,$C,$D,$E,$wl[65],$sl[65],$KL4,0); -	&RIP5($E,$A,$B,$C,$D,$wl[66],$sl[66],$KL4,0); -	&RIP5($D,$E,$A,$B,$C,$wl[67],$sl[67],$KL4,0); -	&RIP5($C,$D,$E,$A,$B,$wl[68],$sl[68],$KL4,0); -	&RIP5($B,$C,$D,$E,$A,$wl[69],$sl[69],$KL4,0); -	&RIP5($A,$B,$C,$D,$E,$wl[70],$sl[70],$KL4,0); -	&RIP5($E,$A,$B,$C,$D,$wl[71],$sl[71],$KL4,0); -	&RIP5($D,$E,$A,$B,$C,$wl[72],$sl[72],$KL4,0); -	&RIP5($C,$D,$E,$A,$B,$wl[73],$sl[73],$KL4,0); -	&RIP5($B,$C,$D,$E,$A,$wl[74],$sl[74],$KL4,0); -	&RIP5($A,$B,$C,$D,$E,$wl[75],$sl[75],$KL4,0); -	&RIP5($E,$A,$B,$C,$D,$wl[76],$sl[76],$KL4,0); -	&RIP5($D,$E,$A,$B,$C,$wl[77],$sl[77],$KL4,0); -	&RIP5($C,$D,$E,$A,$B,$wl[78],$sl[78],$KL4,0); -	&RIP5($B,$C,$D,$E,$A,$wl[79],$sl[79],$KL4,1); - -	# &mov($tmp2,	&wparam(0)); # moved into last RIP5 -	# &mov(&swtmp(16),	$A); -	 &mov($A,	&DWP( 0,$tmp2,"",0)); -	&mov(&swtmp(16+1),	$B); -	 &mov(&swtmp(16+2),	$C); -	&mov($B,	&DWP( 4,$tmp2,"",0)); -	 &mov(&swtmp(16+3),	$D); -	&mov($C,	&DWP( 8,$tmp2,"",0)); -	 &mov(&swtmp(16+4),	$E); -	&mov($D,	&DWP(12,$tmp2,"",0)); -	 &mov($E,	&DWP(16,$tmp2,"",0)); - -	&RIP5($A,$B,$C,$D,$E,$wr[ 0],$sr[ 0],$KR0,-2); -	&RIP5($E,$A,$B,$C,$D,$wr[ 1],$sr[ 1],$KR0,0); -	&RIP5($D,$E,$A,$B,$C,$wr[ 2],$sr[ 2],$KR0,0); -	&RIP5($C,$D,$E,$A,$B,$wr[ 3],$sr[ 3],$KR0,0); -	&RIP5($B,$C,$D,$E,$A,$wr[ 4],$sr[ 4],$KR0,0); -	&RIP5($A,$B,$C,$D,$E,$wr[ 5],$sr[ 5],$KR0,0); -	&RIP5($E,$A,$B,$C,$D,$wr[ 6],$sr[ 6],$KR0,0); -	&RIP5($D,$E,$A,$B,$C,$wr[ 7],$sr[ 7],$KR0,0); -	&RIP5($C,$D,$E,$A,$B,$wr[ 8],$sr[ 8],$KR0,0); -	&RIP5($B,$C,$D,$E,$A,$wr[ 9],$sr[ 9],$KR0,0); -	&RIP5($A,$B,$C,$D,$E,$wr[10],$sr[10],$KR0,0); -	&RIP5($E,$A,$B,$C,$D,$wr[11],$sr[11],$KR0,0); -	&RIP5($D,$E,$A,$B,$C,$wr[12],$sr[12],$KR0,0); -	&RIP5($C,$D,$E,$A,$B,$wr[13],$sr[13],$KR0,0); -	&RIP5($B,$C,$D,$E,$A,$wr[14],$sr[14],$KR0,0); -	&RIP5($A,$B,$C,$D,$E,$wr[15],$sr[15],$KR0,2); - -	
&RIP4($E,$A,$B,$C,$D,$wr[16],$sr[16],$KR1,-2); -	&RIP4($D,$E,$A,$B,$C,$wr[17],$sr[17],$KR1,0); -	&RIP4($C,$D,$E,$A,$B,$wr[18],$sr[18],$KR1,0); -	&RIP4($B,$C,$D,$E,$A,$wr[19],$sr[19],$KR1,0); -	&RIP4($A,$B,$C,$D,$E,$wr[20],$sr[20],$KR1,0); -	&RIP4($E,$A,$B,$C,$D,$wr[21],$sr[21],$KR1,0); -	&RIP4($D,$E,$A,$B,$C,$wr[22],$sr[22],$KR1,0); -	&RIP4($C,$D,$E,$A,$B,$wr[23],$sr[23],$KR1,0); -	&RIP4($B,$C,$D,$E,$A,$wr[24],$sr[24],$KR1,0); -	&RIP4($A,$B,$C,$D,$E,$wr[25],$sr[25],$KR1,0); -	&RIP4($E,$A,$B,$C,$D,$wr[26],$sr[26],$KR1,0); -	&RIP4($D,$E,$A,$B,$C,$wr[27],$sr[27],$KR1,0); -	&RIP4($C,$D,$E,$A,$B,$wr[28],$sr[28],$KR1,0); -	&RIP4($B,$C,$D,$E,$A,$wr[29],$sr[29],$KR1,0); -	&RIP4($A,$B,$C,$D,$E,$wr[30],$sr[30],$KR1,0); -	&RIP4($E,$A,$B,$C,$D,$wr[31],$sr[31],$KR1,2); - -	&RIP3($D,$E,$A,$B,$C,$wr[32],$sr[32],$KR2,-2); -	&RIP3($C,$D,$E,$A,$B,$wr[33],$sr[33],$KR2,0); -	&RIP3($B,$C,$D,$E,$A,$wr[34],$sr[34],$KR2,0); -	&RIP3($A,$B,$C,$D,$E,$wr[35],$sr[35],$KR2,0); -	&RIP3($E,$A,$B,$C,$D,$wr[36],$sr[36],$KR2,0); -	&RIP3($D,$E,$A,$B,$C,$wr[37],$sr[37],$KR2,0); -	&RIP3($C,$D,$E,$A,$B,$wr[38],$sr[38],$KR2,0); -	&RIP3($B,$C,$D,$E,$A,$wr[39],$sr[39],$KR2,0); -	&RIP3($A,$B,$C,$D,$E,$wr[40],$sr[40],$KR2,0); -	&RIP3($E,$A,$B,$C,$D,$wr[41],$sr[41],$KR2,0); -	&RIP3($D,$E,$A,$B,$C,$wr[42],$sr[42],$KR2,0); -	&RIP3($C,$D,$E,$A,$B,$wr[43],$sr[43],$KR2,0); -	&RIP3($B,$C,$D,$E,$A,$wr[44],$sr[44],$KR2,0); -	&RIP3($A,$B,$C,$D,$E,$wr[45],$sr[45],$KR2,0); -	&RIP3($E,$A,$B,$C,$D,$wr[46],$sr[46],$KR2,0); -	&RIP3($D,$E,$A,$B,$C,$wr[47],$sr[47],$KR2,2,$wr[48]); - -	&RIP2($C,$D,$E,$A,$B,$wr[48],$wr[49],$sr[48],$KR3,-2); -	&RIP2($B,$C,$D,$E,$A,$wr[49],$wr[50],$sr[49],$KR3,0); -	&RIP2($A,$B,$C,$D,$E,$wr[50],$wr[51],$sr[50],$KR3,0); -	&RIP2($E,$A,$B,$C,$D,$wr[51],$wr[52],$sr[51],$KR3,0); -	&RIP2($D,$E,$A,$B,$C,$wr[52],$wr[53],$sr[52],$KR3,0); -	&RIP2($C,$D,$E,$A,$B,$wr[53],$wr[54],$sr[53],$KR3,0); -	&RIP2($B,$C,$D,$E,$A,$wr[54],$wr[55],$sr[54],$KR3,0); -	&RIP2($A,$B,$C,$D,$E,$wr[55],$wr[56],$sr[55],$KR3,0); -	&RIP2($E,$A,$B,$C,$D,$wr[56],$wr[57],$sr[56],$KR3,0); -	&RIP2($D,$E,$A,$B,$C,$wr[57],$wr[58],$sr[57],$KR3,0); -	&RIP2($C,$D,$E,$A,$B,$wr[58],$wr[59],$sr[58],$KR3,0); -	&RIP2($B,$C,$D,$E,$A,$wr[59],$wr[60],$sr[59],$KR3,0); -	&RIP2($A,$B,$C,$D,$E,$wr[60],$wr[61],$sr[60],$KR3,0); -	&RIP2($E,$A,$B,$C,$D,$wr[61],$wr[62],$sr[61],$KR3,0); -	&RIP2($D,$E,$A,$B,$C,$wr[62],$wr[63],$sr[62],$KR3,0); -	&RIP2($C,$D,$E,$A,$B,$wr[63],$wr[64],$sr[63],$KR3,2); - -	&RIP1($B,$C,$D,$E,$A,$wr[64],$sr[64],-2); -	&RIP1($A,$B,$C,$D,$E,$wr[65],$sr[65],0); -	&RIP1($E,$A,$B,$C,$D,$wr[66],$sr[66],0); -	&RIP1($D,$E,$A,$B,$C,$wr[67],$sr[67],0); -	&RIP1($C,$D,$E,$A,$B,$wr[68],$sr[68],0); -	&RIP1($B,$C,$D,$E,$A,$wr[69],$sr[69],0); -	&RIP1($A,$B,$C,$D,$E,$wr[70],$sr[70],0); -	&RIP1($E,$A,$B,$C,$D,$wr[71],$sr[71],0); -	&RIP1($D,$E,$A,$B,$C,$wr[72],$sr[72],0); -	&RIP1($C,$D,$E,$A,$B,$wr[73],$sr[73],0); -	&RIP1($B,$C,$D,$E,$A,$wr[74],$sr[74],0); -	&RIP1($A,$B,$C,$D,$E,$wr[75],$sr[75],0); -	&RIP1($E,$A,$B,$C,$D,$wr[76],$sr[76],0); -	&RIP1($D,$E,$A,$B,$C,$wr[77],$sr[77],0); -	&RIP1($C,$D,$E,$A,$B,$wr[78],$sr[78],0); -	&RIP1($B,$C,$D,$E,$A,$wr[79],$sr[79],2); - -	# &mov($tmp2,	&wparam(0)); # Moved into last round - -	 &mov($tmp1,	&DWP( 4,$tmp2,"",0));	# ctx->B - 	&add($D,	$tmp1);	 -	 &mov($tmp1,	&swtmp(16+2));		# $c -	&add($D,	$tmp1); - -	 &mov($tmp1,	&DWP( 8,$tmp2,"",0));	# ctx->C -	&add($E,	$tmp1);	 -	 &mov($tmp1,	&swtmp(16+3));		# $d -	&add($E,	$tmp1); - -	 &mov($tmp1,	&DWP(12,$tmp2,"",0));	# ctx->D -	&add($A,	$tmp1);	 -	 &mov($tmp1,	&swtmp(16+4));		# $e -	&add($A,	
$tmp1); - - -	 &mov($tmp1,	&DWP(16,$tmp2,"",0));	# ctx->E -	&add($B,	$tmp1);	 -	 &mov($tmp1,	&swtmp(16+0));		# $a -	&add($B,	$tmp1); - -	 &mov($tmp1,	&DWP( 0,$tmp2,"",0));	# ctx->A -	&add($C,	$tmp1);	 -	 &mov($tmp1,	&swtmp(16+1));		# $b -	&add($C,	$tmp1); - -	 &mov($tmp1,	&wparam(2)); - -	&mov(&DWP( 0,$tmp2,"",0),	$D); -	 &mov(&DWP( 4,$tmp2,"",0),	$E); -	&mov(&DWP( 8,$tmp2,"",0),	$A); -	 &sub($tmp1,1); -	&mov(&DWP(12,$tmp2,"",0),	$B); -	 &mov(&DWP(16,$tmp2,"",0),	$C); - -	&jle(&label("get_out")); - -	&mov(&wparam(2),$tmp1); -	 &mov($C,	$A); -	&mov($tmp1,	&wparam(1)); -	 &mov($A,	$D); -	&add($tmp1,	64); -	 &mov($B,	$E); -	&mov(&wparam(1),$tmp1); - -	&jmp(&label("start")); - -	&set_label("get_out"); - -	&stack_pop(16+5+6); - -	&pop("ebx"); -	&pop("ebp"); -	&pop("edi"); -	&pop("esi"); -	&ret(); -	&function_end_B($name); -	} - diff --git a/main/openssl/crypto/ripemd/ripemd.h b/main/openssl/crypto/ripemd/ripemd.h deleted file mode 100644 index 189bd8c9..00000000 --- a/main/openssl/crypto/ripemd/ripemd.h +++ /dev/null @@ -1,107 +0,0 @@ -/* crypto/ripemd/ripemd.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_RIPEMD_H -#define HEADER_RIPEMD_H - -#include <openssl/e_os2.h> -#include <stddef.h> - -#ifdef  __cplusplus -extern "C" { -#endif - -#ifdef OPENSSL_NO_RIPEMD -#error RIPEMD is disabled. -#endif - -#if defined(__LP32__) -#define RIPEMD160_LONG unsigned long -#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) -#define RIPEMD160_LONG unsigned long -#define RIPEMD160_LONG_LOG2 3 -#else -#define RIPEMD160_LONG unsigned int -#endif - -#define RIPEMD160_CBLOCK	64 -#define RIPEMD160_LBLOCK	(RIPEMD160_CBLOCK/4) -#define RIPEMD160_DIGEST_LENGTH	20 - -typedef struct RIPEMD160state_st -	{ -	RIPEMD160_LONG A,B,C,D,E; -	RIPEMD160_LONG Nl,Nh; -	RIPEMD160_LONG data[RIPEMD160_LBLOCK]; -	unsigned int   num; -	} RIPEMD160_CTX; - -#ifdef OPENSSL_FIPS -int private_RIPEMD160_Init(RIPEMD160_CTX *c); -#endif -int RIPEMD160_Init(RIPEMD160_CTX *c); -int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len); -int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c); -unsigned char *RIPEMD160(const unsigned char *d, size_t n, -	unsigned char *md); -void RIPEMD160_Transform(RIPEMD160_CTX *c, const unsigned char *b); -#ifdef  __cplusplus -} -#endif - -#endif diff --git a/main/openssl/crypto/ripemd/rmd160.c b/main/openssl/crypto/ripemd/rmd160.c deleted file mode 100644 index b0ec5744..00000000 --- a/main/openssl/crypto/ripemd/rmd160.c +++ /dev/null @@ -1,127 +0,0 @@ -/* crypto/ripemd/rmd160.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/ripemd.h> - -#define BUFSIZE	1024*16 - -void do_fp(FILE *f); -void pt(unsigned char *md); -#if !defined(_OSD_POSIX) && !defined(__DJGPP__) -int read(int, void *, unsigned int); -#endif - -int main(int argc, char **argv) -	{ -	int i,err=0; -	FILE *IN; - -	if (argc == 1) -		{ -		do_fp(stdin); -		} -	else -		{ -		for (i=1; i<argc; i++) -			{ -			IN=fopen(argv[i],"r"); -			if (IN == NULL) -				{ -				perror(argv[i]); -				err++; -				continue; -				} -			printf("RIPEMD160(%s)= ",argv[i]); -			do_fp(IN); -			fclose(IN); -			} -		} -	exit(err); -	} - -void do_fp(FILE *f) -	{ -	RIPEMD160_CTX c; -	unsigned char md[RIPEMD160_DIGEST_LENGTH]; -	int fd; -	int i; -	static unsigned char buf[BUFSIZE]; - -	fd=fileno(f); -	RIPEMD160_Init(&c); -	for (;;) -		{ -		i=read(fd,buf,BUFSIZE); -		if (i <= 0) break; -		RIPEMD160_Update(&c,buf,(unsigned long)i); -		} -	RIPEMD160_Final(&(md[0]),&c); -	pt(md); -	} - -void pt(unsigned char *md) -	{ -	int i; - -	for (i=0; i<RIPEMD160_DIGEST_LENGTH; i++) -		printf("%02x",md[i]); -	printf("\n"); -	} - diff --git a/main/openssl/crypto/ripemd/rmd_dgst.c b/main/openssl/crypto/ripemd/rmd_dgst.c deleted file mode 100644 index d8e72da5..00000000 --- a/main/openssl/crypto/ripemd/rmd_dgst.c +++ /dev/null @@ -1,292 +0,0 @@ -/* crypto/ripemd/rmd_dgst.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). 
- * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include "rmd_locl.h" -#include <openssl/opensslv.h> -#include <openssl/crypto.h> - -const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT; - -#  ifdef RMD160_ASM -     void ripemd160_block_x86(RIPEMD160_CTX *c, unsigned long *p,size_t num); -#    define ripemd160_block ripemd160_block_x86 -#  else -     void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num); -#  endif - -fips_md_init(RIPEMD160) -	{ -	memset (c,0,sizeof(*c)); -	c->A=RIPEMD160_A; -	c->B=RIPEMD160_B; -	c->C=RIPEMD160_C; -	c->D=RIPEMD160_D; -	c->E=RIPEMD160_E; -	return 1; -	} - -#ifndef ripemd160_block_data_order -#ifdef X -#undef X -#endif -void ripemd160_block_data_order (RIPEMD160_CTX *ctx, const void *p, size_t num) -	{ -	const unsigned char *data=p; -	register unsigned MD32_REG_T A,B,C,D,E; -	unsigned MD32_REG_T a,b,c,d,e,l; -#ifndef MD32_XARRAY -	/* See comment in crypto/sha/sha_locl.h for details. */ -	unsigned MD32_REG_T	XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, -				XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; -# define X(i)	XX##i -#else -	RIPEMD160_LONG	XX[16]; -# define X(i)	XX[i] -#endif - -	for (;num--;) -		{ - -	A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - -	(void)HOST_c2l(data,l); X( 0)=l;(void)HOST_c2l(data,l); X( 1)=l; -	RIP1(A,B,C,D,E,WL00,SL00);	(void)HOST_c2l(data,l); X( 2)=l; -	RIP1(E,A,B,C,D,WL01,SL01);	(void)HOST_c2l(data,l); X( 3)=l; -	RIP1(D,E,A,B,C,WL02,SL02);	(void)HOST_c2l(data,l); X( 4)=l; -	RIP1(C,D,E,A,B,WL03,SL03);	(void)HOST_c2l(data,l); X( 5)=l; -	RIP1(B,C,D,E,A,WL04,SL04);	(void)HOST_c2l(data,l); X( 6)=l; -	RIP1(A,B,C,D,E,WL05,SL05);	(void)HOST_c2l(data,l); X( 7)=l; -	RIP1(E,A,B,C,D,WL06,SL06);	(void)HOST_c2l(data,l); X( 8)=l; -	RIP1(D,E,A,B,C,WL07,SL07);	(void)HOST_c2l(data,l); X( 9)=l; -	RIP1(C,D,E,A,B,WL08,SL08);	(void)HOST_c2l(data,l); X(10)=l; -	RIP1(B,C,D,E,A,WL09,SL09);	(void)HOST_c2l(data,l); X(11)=l; -	RIP1(A,B,C,D,E,WL10,SL10);	(void)HOST_c2l(data,l); X(12)=l; -	RIP1(E,A,B,C,D,WL11,SL11);	(void)HOST_c2l(data,l); X(13)=l; -	RIP1(D,E,A,B,C,WL12,SL12);	(void)HOST_c2l(data,l); X(14)=l; -	RIP1(C,D,E,A,B,WL13,SL13);	(void)HOST_c2l(data,l); X(15)=l; -	RIP1(B,C,D,E,A,WL14,SL14); -	RIP1(A,B,C,D,E,WL15,SL15); - -	RIP2(E,A,B,C,D,WL16,SL16,KL1); -	RIP2(D,E,A,B,C,WL17,SL17,KL1); -	RIP2(C,D,E,A,B,WL18,SL18,KL1); -	RIP2(B,C,D,E,A,WL19,SL19,KL1); -	RIP2(A,B,C,D,E,WL20,SL20,KL1); -	RIP2(E,A,B,C,D,WL21,SL21,KL1); -	RIP2(D,E,A,B,C,WL22,SL22,KL1); -	RIP2(C,D,E,A,B,WL23,SL23,KL1); -	RIP2(B,C,D,E,A,WL24,SL24,KL1); -	RIP2(A,B,C,D,E,WL25,SL25,KL1); -	RIP2(E,A,B,C,D,WL26,SL26,KL1); -	RIP2(D,E,A,B,C,WL27,SL27,KL1); -	RIP2(C,D,E,A,B,WL28,SL28,KL1); -	RIP2(B,C,D,E,A,WL29,SL29,KL1); -	RIP2(A,B,C,D,E,WL30,SL30,KL1); -	RIP2(E,A,B,C,D,WL31,SL31,KL1); - -	RIP3(D,E,A,B,C,WL32,SL32,KL2); -	RIP3(C,D,E,A,B,WL33,SL33,KL2); -	RIP3(B,C,D,E,A,WL34,SL34,KL2); -	RIP3(A,B,C,D,E,WL35,SL35,KL2); -	RIP3(E,A,B,C,D,WL36,SL36,KL2); -	RIP3(D,E,A,B,C,WL37,SL37,KL2); -	RIP3(C,D,E,A,B,WL38,SL38,KL2); -	RIP3(B,C,D,E,A,WL39,SL39,KL2); -	RIP3(A,B,C,D,E,WL40,SL40,KL2); -	RIP3(E,A,B,C,D,WL41,SL41,KL2); -	RIP3(D,E,A,B,C,WL42,SL42,KL2); -	RIP3(C,D,E,A,B,WL43,SL43,KL2); -	RIP3(B,C,D,E,A,WL44,SL44,KL2); -	RIP3(A,B,C,D,E,WL45,SL45,KL2); -	RIP3(E,A,B,C,D,WL46,SL46,KL2); -	RIP3(D,E,A,B,C,WL47,SL47,KL2); - -	RIP4(C,D,E,A,B,WL48,SL48,KL3); -	RIP4(B,C,D,E,A,WL49,SL49,KL3); -	RIP4(A,B,C,D,E,WL50,SL50,KL3); -	RIP4(E,A,B,C,D,WL51,SL51,KL3); -	RIP4(D,E,A,B,C,WL52,SL52,KL3); -	RIP4(C,D,E,A,B,WL53,SL53,KL3); -	RIP4(B,C,D,E,A,WL54,SL54,KL3); -	RIP4(A,B,C,D,E,WL55,SL55,KL3); -	RIP4(E,A,B,C,D,WL56,SL56,KL3); 
-	RIP4(D,E,A,B,C,WL57,SL57,KL3); -	RIP4(C,D,E,A,B,WL58,SL58,KL3); -	RIP4(B,C,D,E,A,WL59,SL59,KL3); -	RIP4(A,B,C,D,E,WL60,SL60,KL3); -	RIP4(E,A,B,C,D,WL61,SL61,KL3); -	RIP4(D,E,A,B,C,WL62,SL62,KL3); -	RIP4(C,D,E,A,B,WL63,SL63,KL3); - -	RIP5(B,C,D,E,A,WL64,SL64,KL4); -	RIP5(A,B,C,D,E,WL65,SL65,KL4); -	RIP5(E,A,B,C,D,WL66,SL66,KL4); -	RIP5(D,E,A,B,C,WL67,SL67,KL4); -	RIP5(C,D,E,A,B,WL68,SL68,KL4); -	RIP5(B,C,D,E,A,WL69,SL69,KL4); -	RIP5(A,B,C,D,E,WL70,SL70,KL4); -	RIP5(E,A,B,C,D,WL71,SL71,KL4); -	RIP5(D,E,A,B,C,WL72,SL72,KL4); -	RIP5(C,D,E,A,B,WL73,SL73,KL4); -	RIP5(B,C,D,E,A,WL74,SL74,KL4); -	RIP5(A,B,C,D,E,WL75,SL75,KL4); -	RIP5(E,A,B,C,D,WL76,SL76,KL4); -	RIP5(D,E,A,B,C,WL77,SL77,KL4); -	RIP5(C,D,E,A,B,WL78,SL78,KL4); -	RIP5(B,C,D,E,A,WL79,SL79,KL4); - -	a=A; b=B; c=C; d=D; e=E; -	/* Do other half */ -	A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - -	RIP5(A,B,C,D,E,WR00,SR00,KR0); -	RIP5(E,A,B,C,D,WR01,SR01,KR0); -	RIP5(D,E,A,B,C,WR02,SR02,KR0); -	RIP5(C,D,E,A,B,WR03,SR03,KR0); -	RIP5(B,C,D,E,A,WR04,SR04,KR0); -	RIP5(A,B,C,D,E,WR05,SR05,KR0); -	RIP5(E,A,B,C,D,WR06,SR06,KR0); -	RIP5(D,E,A,B,C,WR07,SR07,KR0); -	RIP5(C,D,E,A,B,WR08,SR08,KR0); -	RIP5(B,C,D,E,A,WR09,SR09,KR0); -	RIP5(A,B,C,D,E,WR10,SR10,KR0); -	RIP5(E,A,B,C,D,WR11,SR11,KR0); -	RIP5(D,E,A,B,C,WR12,SR12,KR0); -	RIP5(C,D,E,A,B,WR13,SR13,KR0); -	RIP5(B,C,D,E,A,WR14,SR14,KR0); -	RIP5(A,B,C,D,E,WR15,SR15,KR0); - -	RIP4(E,A,B,C,D,WR16,SR16,KR1); -	RIP4(D,E,A,B,C,WR17,SR17,KR1); -	RIP4(C,D,E,A,B,WR18,SR18,KR1); -	RIP4(B,C,D,E,A,WR19,SR19,KR1); -	RIP4(A,B,C,D,E,WR20,SR20,KR1); -	RIP4(E,A,B,C,D,WR21,SR21,KR1); -	RIP4(D,E,A,B,C,WR22,SR22,KR1); -	RIP4(C,D,E,A,B,WR23,SR23,KR1); -	RIP4(B,C,D,E,A,WR24,SR24,KR1); -	RIP4(A,B,C,D,E,WR25,SR25,KR1); -	RIP4(E,A,B,C,D,WR26,SR26,KR1); -	RIP4(D,E,A,B,C,WR27,SR27,KR1); -	RIP4(C,D,E,A,B,WR28,SR28,KR1); -	RIP4(B,C,D,E,A,WR29,SR29,KR1); -	RIP4(A,B,C,D,E,WR30,SR30,KR1); -	RIP4(E,A,B,C,D,WR31,SR31,KR1); - -	RIP3(D,E,A,B,C,WR32,SR32,KR2); -	RIP3(C,D,E,A,B,WR33,SR33,KR2); -	RIP3(B,C,D,E,A,WR34,SR34,KR2); -	RIP3(A,B,C,D,E,WR35,SR35,KR2); -	RIP3(E,A,B,C,D,WR36,SR36,KR2); -	RIP3(D,E,A,B,C,WR37,SR37,KR2); -	RIP3(C,D,E,A,B,WR38,SR38,KR2); -	RIP3(B,C,D,E,A,WR39,SR39,KR2); -	RIP3(A,B,C,D,E,WR40,SR40,KR2); -	RIP3(E,A,B,C,D,WR41,SR41,KR2); -	RIP3(D,E,A,B,C,WR42,SR42,KR2); -	RIP3(C,D,E,A,B,WR43,SR43,KR2); -	RIP3(B,C,D,E,A,WR44,SR44,KR2); -	RIP3(A,B,C,D,E,WR45,SR45,KR2); -	RIP3(E,A,B,C,D,WR46,SR46,KR2); -	RIP3(D,E,A,B,C,WR47,SR47,KR2); - -	RIP2(C,D,E,A,B,WR48,SR48,KR3); -	RIP2(B,C,D,E,A,WR49,SR49,KR3); -	RIP2(A,B,C,D,E,WR50,SR50,KR3); -	RIP2(E,A,B,C,D,WR51,SR51,KR3); -	RIP2(D,E,A,B,C,WR52,SR52,KR3); -	RIP2(C,D,E,A,B,WR53,SR53,KR3); -	RIP2(B,C,D,E,A,WR54,SR54,KR3); -	RIP2(A,B,C,D,E,WR55,SR55,KR3); -	RIP2(E,A,B,C,D,WR56,SR56,KR3); -	RIP2(D,E,A,B,C,WR57,SR57,KR3); -	RIP2(C,D,E,A,B,WR58,SR58,KR3); -	RIP2(B,C,D,E,A,WR59,SR59,KR3); -	RIP2(A,B,C,D,E,WR60,SR60,KR3); -	RIP2(E,A,B,C,D,WR61,SR61,KR3); -	RIP2(D,E,A,B,C,WR62,SR62,KR3); -	RIP2(C,D,E,A,B,WR63,SR63,KR3); - -	RIP1(B,C,D,E,A,WR64,SR64); -	RIP1(A,B,C,D,E,WR65,SR65); -	RIP1(E,A,B,C,D,WR66,SR66); -	RIP1(D,E,A,B,C,WR67,SR67); -	RIP1(C,D,E,A,B,WR68,SR68); -	RIP1(B,C,D,E,A,WR69,SR69); -	RIP1(A,B,C,D,E,WR70,SR70); -	RIP1(E,A,B,C,D,WR71,SR71); -	RIP1(D,E,A,B,C,WR72,SR72); -	RIP1(C,D,E,A,B,WR73,SR73); -	RIP1(B,C,D,E,A,WR74,SR74); -	RIP1(A,B,C,D,E,WR75,SR75); -	RIP1(E,A,B,C,D,WR76,SR76); -	RIP1(D,E,A,B,C,WR77,SR77); -	RIP1(C,D,E,A,B,WR78,SR78); -	RIP1(B,C,D,E,A,WR79,SR79); - -	D     =ctx->B+c+D; -	ctx->B=ctx->C+d+E; -	ctx->C=ctx->D+e+A; -	ctx->D=ctx->E+a+B; -	
ctx->E=ctx->A+b+C; -	ctx->A=D; - -		} -	} -#endif diff --git a/main/openssl/crypto/ripemd/rmd_locl.h b/main/openssl/crypto/ripemd/rmd_locl.h deleted file mode 100644 index 2bd8957d..00000000 --- a/main/openssl/crypto/ripemd/rmd_locl.h +++ /dev/null @@ -1,150 +0,0 @@ -/* crypto/ripemd/rmd_locl.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdlib.h> -#include <string.h> -#include <openssl/opensslconf.h> -#include <openssl/ripemd.h> - -#ifndef RIPEMD160_LONG_LOG2 -#define RIPEMD160_LONG_LOG2 2 /* default to 32 bits */ -#endif - -/* - * DO EXAMINE COMMENTS IN crypto/md5/md5_locl.h & crypto/md5/md5_dgst.c - * FOR EXPLANATIONS ON FOLLOWING "CODE." - *					<appro@fy.chalmers.se> - */ -#ifdef RMD160_ASM -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -#  define ripemd160_block_data_order ripemd160_block_asm_data_order -# endif -#endif - -void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num); - -#define DATA_ORDER_IS_LITTLE_ENDIAN - -#define HASH_LONG               RIPEMD160_LONG -#define HASH_CTX                RIPEMD160_CTX -#define HASH_CBLOCK             RIPEMD160_CBLOCK -#define HASH_UPDATE             RIPEMD160_Update -#define HASH_TRANSFORM          RIPEMD160_Transform -#define HASH_FINAL              RIPEMD160_Final -#define	HASH_MAKE_STRING(c,s)	do {	\ -	unsigned long ll;		\ -	ll=(c)->A; (void)HOST_l2c(ll,(s));	\ -	ll=(c)->B; (void)HOST_l2c(ll,(s));	\ -	ll=(c)->C; (void)HOST_l2c(ll,(s));	\ -	ll=(c)->D; (void)HOST_l2c(ll,(s));	\ -	ll=(c)->E; (void)HOST_l2c(ll,(s));	\ -	} while (0) -#define HASH_BLOCK_DATA_ORDER   ripemd160_block_data_order - -#include "md32_common.h" - -#if 0 -#define F1(x,y,z)	 ((x)^(y)^(z)) -#define F2(x,y,z)	(((x)&(y))|((~x)&z)) -#define F3(x,y,z)	(((x)|(~y))^(z)) -#define F4(x,y,z)	(((x)&(z))|((y)&(~(z)))) -#define F5(x,y,z)	 ((x)^((y)|(~(z)))) -#else -/* - * Transformed F2 and F4 are courtesy of Wei Dai <weidai@eskimo.com> - */ -#define F1(x,y,z)	((x) ^ (y) ^ (z)) -#define F2(x,y,z)	((((y) ^ (z)) & (x)) ^ (z)) -#define F3(x,y,z)	(((~(y)) | (x)) ^ (z)) -#define F4(x,y,z)	((((x) ^ (y)) & (z)) ^ (y)) -#define F5(x,y,z)	(((~(z)) | (y)) ^ (x)) -#endif - -#define RIPEMD160_A	0x67452301L -#define RIPEMD160_B	0xEFCDAB89L -#define RIPEMD160_C	0x98BADCFEL -#define RIPEMD160_D	0x10325476L -#define RIPEMD160_E	0xC3D2E1F0L - -#include "rmdconst.h" - -#define RIP1(a,b,c,d,e,w,s) { \ -	a+=F1(b,c,d)+X(w); \ -        a=ROTATE(a,s)+e; \ -        c=ROTATE(c,10); } - -#define RIP2(a,b,c,d,e,w,s,K) { \ -	a+=F2(b,c,d)+X(w)+K; \ -        a=ROTATE(a,s)+e; \ -        c=ROTATE(c,10); } - -#define RIP3(a,b,c,d,e,w,s,K) { \ -	a+=F3(b,c,d)+X(w)+K; \ -        a=ROTATE(a,s)+e; \ -        c=ROTATE(c,10); } - -#define RIP4(a,b,c,d,e,w,s,K) { \ -	a+=F4(b,c,d)+X(w)+K; \ -        a=ROTATE(a,s)+e; \ -        c=ROTATE(c,10); } - -#define RIP5(a,b,c,d,e,w,s,K) { \ -	a+=F5(b,c,d)+X(w)+K; \ -        a=ROTATE(a,s)+e; \ -        c=ROTATE(c,10); } - diff --git a/main/openssl/crypto/ripemd/rmd_one.c b/main/openssl/crypto/ripemd/rmd_one.c deleted file mode 100644 index 3efb1375..00000000 --- a/main/openssl/crypto/ripemd/rmd_one.c +++ /dev/null @@ -1,78 +0,0 @@ -/* crypto/ripemd/rmd_one.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <stdio.h> -#include <string.h> -#include <openssl/ripemd.h> -#include <openssl/crypto.h> - -unsigned char *RIPEMD160(const unsigned char *d, size_t n, -	     unsigned char *md) -	{ -	RIPEMD160_CTX c; -	static unsigned char m[RIPEMD160_DIGEST_LENGTH]; - -	if (md == NULL) md=m; -	if (!RIPEMD160_Init(&c)) -		return NULL; -	RIPEMD160_Update(&c,d,n); -	RIPEMD160_Final(md,&c); -	OPENSSL_cleanse(&c,sizeof(c)); /* security consideration */ -	return(md); -	} - diff --git a/main/openssl/crypto/ripemd/rmdconst.h b/main/openssl/crypto/ripemd/rmdconst.h deleted file mode 100644 index 59c48dea..00000000 --- a/main/openssl/crypto/ripemd/rmdconst.h +++ /dev/null @@ -1,399 +0,0 @@ -/* crypto/ripemd/rmdconst.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ -#define KL0 0x00000000L -#define KL1 0x5A827999L -#define KL2 0x6ED9EBA1L -#define KL3 0x8F1BBCDCL -#define KL4 0xA953FD4EL - -#define KR0 0x50A28BE6L -#define KR1 0x5C4DD124L -#define KR2 0x6D703EF3L -#define KR3 0x7A6D76E9L -#define KR4 0x00000000L - -#define WL00  0 -#define SL00 11 -#define WL01  1 -#define SL01 14 -#define WL02  2 -#define SL02 15 -#define WL03  3 -#define SL03 12 -#define WL04  4 -#define SL04  5 -#define WL05  5 -#define SL05  8 -#define WL06  6 -#define SL06  7 -#define WL07  7 -#define SL07  9 -#define WL08  8 -#define SL08 11 -#define WL09  9 -#define SL09 13 -#define WL10 10 -#define SL10 14 -#define WL11 11 -#define SL11 15 -#define WL12 12 -#define SL12  6 -#define WL13 13 -#define SL13  7 -#define WL14 14 -#define SL14  9 -#define WL15 15 -#define SL15  8 - -#define WL16  7 -#define SL16  7 -#define WL17  4 -#define SL17  6 -#define WL18 13 -#define SL18  8 -#define WL19  1 -#define SL19 13 -#define WL20 10 -#define SL20 11 -#define WL21  6 -#define SL21  9 -#define WL22 15 -#define SL22  7 -#define WL23  3 -#define SL23 15 -#define WL24 12 -#define SL24  7 -#define WL25  0 -#define SL25 12 -#define WL26  9 -#define SL26 15 -#define WL27  5 -#define SL27  9 -#define WL28  2 -#define SL28 11 -#define WL29 14 -#define SL29  7 -#define WL30 11 -#define SL30 13 -#define WL31  8 -#define SL31 12 - -#define WL32  3 -#define SL32 11 -#define WL33 10 -#define SL33 13 -#define WL34 14 -#define SL34  6 -#define WL35  4 -#define SL35  7 -#define WL36  9 -#define SL36 14 -#define WL37 15 -#define SL37  9 -#define WL38  8 -#define SL38 13 -#define WL39  1 -#define SL39 15 -#define WL40  2 -#define SL40 14 -#define WL41  7 -#define SL41  8 -#define WL42  0 -#define SL42 13 -#define WL43  6 -#define SL43  6 -#define WL44 13 -#define SL44  5 -#define WL45 11 -#define SL45 12 -#define WL46  5 -#define SL46  7 -#define WL47 12 -#define SL47  5 - -#define WL48  1 -#define SL48 11 -#define WL49  9 -#define SL49 12 -#define WL50 11 -#define SL50 14 -#define WL51 10 -#define SL51 15 -#define WL52  0 -#define SL52 14 -#define WL53  8 -#define SL53 15 -#define WL54 12 -#define SL54  9 -#define WL55  4 -#define SL55  8 -#define WL56 13 -#define SL56  9 -#define WL57  3 -#define SL57 14 -#define WL58  7 -#define SL58  5 -#define WL59 15 -#define SL59  6 -#define WL60 14 -#define SL60  8 -#define WL61  5 -#define SL61  6 -#define WL62  6 -#define SL62  5 -#define WL63  2 -#define SL63 12 - -#define WL64  4 -#define SL64  9 -#define WL65  0 -#define SL65 15 -#define WL66  5 -#define SL66  5 -#define WL67  9 -#define SL67 11 -#define WL68  7 -#define SL68  6 -#define WL69 12 -#define SL69  8 -#define WL70  2 -#define SL70 13 -#define WL71 10 -#define SL71 12 -#define WL72 14 -#define SL72  5 -#define WL73  1 -#define SL73 12 -#define WL74  3 -#define SL74 13 -#define WL75  8 -#define SL75 14 -#define WL76 11 -#define SL76 11 -#define WL77  6 -#define SL77  8 -#define WL78 15 -#define SL78  5 -#define WL79 13 -#define SL79  6 - -#define WR00  5 -#define SR00  8 -#define WR01 14 -#define SR01  9 -#define WR02  7 -#define SR02  9 -#define WR03  0 -#define SR03 11 -#define WR04  9 -#define SR04 13 -#define WR05  2 -#define SR05 15 -#define WR06 11 -#define SR06 15 -#define WR07  4 -#define SR07  5 -#define WR08 13 -#define SR08  7 -#define WR09  6 -#define SR09  7 -#define WR10 15 -#define SR10  8 -#define WR11  8 -#define SR11 11 -#define WR12  1 -#define SR12 14 -#define WR13 10 -#define SR13 14 -#define WR14  3 -#define SR14 12 -#define WR15 12 -#define SR15  6 - -#define WR16  6 
-#define SR16  9 -#define WR17 11 -#define SR17 13 -#define WR18  3 -#define SR18 15 -#define WR19  7 -#define SR19  7 -#define WR20  0 -#define SR20 12 -#define WR21 13 -#define SR21  8 -#define WR22  5 -#define SR22  9 -#define WR23 10 -#define SR23 11 -#define WR24 14 -#define SR24  7 -#define WR25 15 -#define SR25  7 -#define WR26  8 -#define SR26 12 -#define WR27 12 -#define SR27  7 -#define WR28  4 -#define SR28  6 -#define WR29  9 -#define SR29 15 -#define WR30  1 -#define SR30 13 -#define WR31  2 -#define SR31 11 - -#define WR32 15 -#define SR32  9 -#define WR33  5 -#define SR33  7 -#define WR34  1 -#define SR34 15 -#define WR35  3 -#define SR35 11 -#define WR36  7 -#define SR36  8 -#define WR37 14 -#define SR37  6 -#define WR38  6 -#define SR38  6 -#define WR39  9 -#define SR39 14 -#define WR40 11 -#define SR40 12 -#define WR41  8 -#define SR41 13 -#define WR42 12 -#define SR42  5 -#define WR43  2 -#define SR43 14 -#define WR44 10 -#define SR44 13 -#define WR45  0 -#define SR45 13 -#define WR46  4 -#define SR46  7 -#define WR47 13 -#define SR47  5 - -#define WR48  8 -#define SR48 15 -#define WR49  6 -#define SR49  5 -#define WR50  4 -#define SR50  8 -#define WR51  1 -#define SR51 11 -#define WR52  3 -#define SR52 14 -#define WR53 11 -#define SR53 14 -#define WR54 15 -#define SR54  6 -#define WR55  0 -#define SR55 14 -#define WR56  5 -#define SR56  6 -#define WR57 12 -#define SR57  9 -#define WR58  2 -#define SR58 12 -#define WR59 13 -#define SR59  9 -#define WR60  9 -#define SR60 12 -#define WR61  7 -#define SR61  5 -#define WR62 10 -#define SR62 15 -#define WR63 14 -#define SR63  8 - -#define WR64 12 -#define SR64  8 -#define WR65 15 -#define SR65  5 -#define WR66 10 -#define SR66 12 -#define WR67  4 -#define SR67  9 -#define WR68  1 -#define SR68 12 -#define WR69  5 -#define SR69  5 -#define WR70  8 -#define SR70 14 -#define WR71  7 -#define SR71  6 -#define WR72  6 -#define SR72  8 -#define WR73  2 -#define SR73 13 -#define WR74 13 -#define SR74  6 -#define WR75 14 -#define SR75  5 -#define WR76  0 -#define SR76 15 -#define WR77  3 -#define SR77 13 -#define WR78  9 -#define SR78 11 -#define WR79 11 -#define SR79 11 - diff --git a/main/openssl/crypto/ripemd/rmdtest.c b/main/openssl/crypto/ripemd/rmdtest.c deleted file mode 100644 index fb34e0e8..00000000 --- a/main/openssl/crypto/ripemd/rmdtest.c +++ /dev/null @@ -1,145 +0,0 @@ -/* crypto/ripemd/rmdtest.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - *  - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to.  The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code.  The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - *  - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- *  - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - *    must display the following acknowledgement: - *    "This product includes cryptographic software written by - *     Eric Young (eay@cryptsoft.com)" - *    The word 'cryptographic' can be left out if the rouines from the library - *    being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from  - *    the apps directory (application code) you must include an acknowledgement: - *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - *  - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - *  - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed.  i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
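The WLxx/SLxx and WRxx/SRxx tables removed above are RIPEMD-160's per-step message-word indices and left-rotation amounts for its two parallel computation lines, with KL*/KR* the round constants. As a hedged illustration of how one left-line step consumes them (the helper names below are invented for this sketch and are not the removed rmd_dgst.c code):

    #include <stdint.h>

    /* Illustrative helpers only -- not taken from the deleted rmd_dgst.c. */
    #define ROTL32(x, n)    (((x) << (n)) | ((x) >> (32 - (n))))
    #define F00_15(b, c, d) ((b) ^ (c) ^ (d))   /* selection function for steps 0..15 */

    /* One left-line step of RIPEMD-160, written out for step 0: WL00 selects the
     * message word (X[0]), SL00 is the rotation amount (11), and KL0 is the round
     * constant (0x00000000).  The right line runs in parallel using WRxx/SRxx/KRx. */
    static void rmd160_left_step0(uint32_t h[5], const uint32_t X[16])
    {
        uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
        uint32_t t = a + F00_15(b, c, d) + X[0] + 0x00000000UL;  /* X[WL00] + KL0 */
        t = ROTL32(t, 11) + e;                                   /* rotate by SL00 */
        /* shuffle the working variables: (a,b,c,d,e) <- (e, t, b, rol10(c), d) */
        h[0] = e; h[1] = t; h[2] = b; h[3] = ROTL32(c, 10); h[4] = d;
    }

Steps 16..79 follow the same shape, only with the later WLxx/SLxx entries, a different selection function, and the next KL constant.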
- */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include "../e_os.h" - -#ifdef OPENSSL_NO_RIPEMD -int main(int argc, char *argv[]) -{ -    printf("No ripemd support\n"); -    return(0); -} -#else -#include <openssl/ripemd.h> -#include <openssl/evp.h> - -#ifdef CHARSET_EBCDIC -#include <openssl/ebcdic.h> -#endif - -static char *test[]={ -	"", -	"a", -	"abc", -	"message digest", -	"abcdefghijklmnopqrstuvwxyz", -	"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", -	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", -	"12345678901234567890123456789012345678901234567890123456789012345678901234567890", -	NULL, -	}; - -static char *ret[]={ -	"9c1185a5c5e9fc54612808977ee8f548b2258d31", -	"0bdc9d2d256b3ee9daae347be6f4dc835a467ffe", -	"8eb208f7e05d987a9b044a8e98c6b087f15a0bfc", -	"5d0689ef49d2fae572b881b123a85ffa21595f36", -	"f71c27109c692c1b56bbdceb5b9d2865b3708dbc", -	"12a053384a9c0c88e405a06c27dcf49ada62eb2b", -	"b0e20b6e3116640286ed3a87a5713079b21f5189", -	"9b752e45573d4b39f4dbd3323cab82bf63326bfb", -	}; - -static char *pt(unsigned char *md); -int main(int argc, char *argv[]) -	{ -	int i,err=0; -	char **P,**R; -	char *p; -	unsigned char md[RIPEMD160_DIGEST_LENGTH]; - -	P=test; -	R=ret; -	i=1; -	while (*P != NULL) -		{ -#ifdef CHARSET_EBCDIC -		ebcdic2ascii((char *)*P, (char *)*P, strlen((char *)*P)); -#endif -		EVP_Digest(&(P[0][0]),strlen((char *)*P),md,NULL,EVP_ripemd160(), NULL); -		p=pt(md); -		if (strcmp(p,(char *)*R) != 0) -			{ -			printf("error calculating RIPEMD160 on '%s'\n",*P); -			printf("got %s instead of %s\n",p,*R); -			err++; -			} -		else -			printf("test %d ok\n",i); -		i++; -		R++; -		P++; -		} -	EXIT(err); -	return(0); -	} - -static char *pt(unsigned char *md) -	{ -	int i; -	static char buf[80]; - -	for (i=0; i<RIPEMD160_DIGEST_LENGTH; i++) -		sprintf(&(buf[i*2]),"%02x",md[i]); -	return(buf); -	} -#endif diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl index 33da3e0e..50bd07b3 100644 --- a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl +++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl @@ -1,7 +1,7 @@  #!/usr/bin/env perl  # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL  # project. The module is, however, dual licensed under OpenSSL and  # CRYPTOGAMS licenses depending on where you obtain it. For further  # details see http://www.openssl.org/~appro/cryptogams/. @@ -52,6 +52,20 @@  # Profiler-assisted and platform-specific optimization resulted in 10%  # improvement on Cortex A8 core and 12.2 cycles per byte. +# September 2013. +# +# Add NEON implementation (see sha1-586.pl for background info). On +# Cortex A8 it was measured to process one byte in 6.7 cycles or >80% +# faster than integer-only code. Because [fully unrolled] NEON code +# is ~2.5x larger and there are some redundant instructions executed +# when processing last block, improvement is not as big for smallest +# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per +# byte, which is also >80% faster than integer-only code. + +# May 2014. +# +# Add ARMv8 code path performing at 2.35 cpb on Apple A7. 
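In the hunks that follow, sha1_block_data_order gains a runtime dispatch on OPENSSL_armcap_P: when the ARMV8_SHA1 bit is set it branches to .LARMv8, otherwise when ARMV7_NEON is set it branches to .LNEON, and otherwise it falls through to the existing integer-only ARMv4 code. A C-level sketch of that selection, with illustrative prototypes (the real entry points are the assembly labels in this patch, and the *_int name here is hypothetical):

    #include <stddef.h>
    #include "arm_arch.h"   /* OPENSSL_armcap_P, ARMV7_NEON, ARMV8_SHA1 */

    /* Simplified prototypes for the sketch; parameter types are not the
     * exact ones used by the assembly routines. */
    void sha1_block_data_order_armv8(void *ctx, const void *inp, size_t num);
    void sha1_block_data_order_neon (void *ctx, const void *inp, size_t num);
    void sha1_block_data_order_int  (void *ctx, const void *inp, size_t num); /* hypothetical name */

    /* What the new prologue does: test the capability word once per call and
     * jump to the widest implementation the CPU advertises. */
    void sha1_block_data_order_sketch(void *ctx, const void *inp, size_t num)
    {
        if (OPENSSL_armcap_P & ARMV8_SHA1)
            sha1_block_data_order_armv8(ctx, inp, num);  /* .LARMv8: SHA-1 crypto extensions */
        else if (OPENSSL_armcap_P & ARMV7_NEON)
            sha1_block_data_order_neon(ctx, inp, num);   /* .LNEON: vectorised message schedule */
        else
            sha1_block_data_order_int(ctx, inp, num);    /* original ARMv4 integer path */
    }

The assembly performs the same test inline with two tst/bne pairs, so no extra call indirection is introduced on the integer-only path.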
+  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}  open STDOUT,">$output"; @@ -153,12 +167,22 @@ $code=<<___;  #include "arm_arch.h"  .text +.code	32  .global	sha1_block_data_order  .type	sha1_block_data_order,%function -.align	2 +.align	5  sha1_block_data_order: +#if __ARM_ARCH__>=7 +	sub	r3,pc,#8		@ sha1_block_data_order +	ldr	r12,.LOPENSSL_armcap +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P +	tst	r12,#ARMV8_SHA1 +	bne	.LARMv8 +	tst	r12,#ARMV7_NEON +	bne	.LNEON +#endif  	stmdb	sp!,{r4-r12,lr}  	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp  	ldmia	$ctx,{$a,$b,$c,$d,$e} @@ -233,16 +257,422 @@ $code.=<<___;  	moveq	pc,lr			@ be binary compatible with V4, yet  	bx	lr			@ interoperable with Thumb ISA:-)  #endif -.align	2 +.size	sha1_block_data_order,.-sha1_block_data_order + +.align	5  .LK_00_19:	.word	0x5a827999  .LK_20_39:	.word	0x6ed9eba1  .LK_40_59:	.word	0x8f1bbcdc  .LK_60_79:	.word	0xca62c1d6 -.size	sha1_block_data_order,.-sha1_block_data_order -.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" -.align	2 +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-sha1_block_data_order +.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align	5 +___ +##################################################################### +# NEON stuff +# +{{{ +my @V=($a,$b,$c,$d,$e); +my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14)); +my $Xi=4; +my @X=map("q$_",(8..11,0..3)); +my @Tx=("q12","q13"); +my ($K,$zero)=("q14","q15"); +my $j=0; + +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; +  my $arg = pop; +    $arg = "#$arg" if ($arg*1 eq $arg); +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub body_00_19 () { +	( +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'. +	'&bic	($t0,$d,$b)', +	'&add	($e,$e,$Ki)',		# e+=X[i]+K +	'&and	($t1,$c,$b)', +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27) +	'&eor	($t1,$t1,$t0)',		# F_00_19 +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2) +	'&add	($e,$e,$t1);'.		# e+=F_00_19 +	'$j++;	unshift(@V,pop(@V));' +	) +} +sub body_20_39 () { +	( +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'. +	'&eor	($t0,$b,$d)', +	'&add	($e,$e,$Ki)',		# e+=X[i]+K +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)', +	'&eor	($t1,$t0,$c)',		# F_20_39 +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27) +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2) +	'&add	($e,$e,$t1);'.		# e+=F_20_39 +	'$j++;	unshift(@V,pop(@V));' +	) +} +sub body_40_59 () { +	( +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'. +	'&add	($e,$e,$Ki)',		# e+=X[i]+K +	'&and	($t0,$c,$d)', +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27) +	'&eor	($t1,$c,$d)', +	'&add	($e,$e,$t0)', +	'&and	($t1,$t1,$b)', +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2) +	'&add	($e,$e,$t1);'.		
# e+=F_40_59 +	'$j++;	unshift(@V,pop(@V));' +	) +} + +sub Xupdate_16_31 () +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e); + +	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vadd_i32	(@Tx[1],@X[-1&7],$K); +	 eval(shift(@insns)); +	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0); +	 eval(shift(@insns)); +	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer +	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(@X[0],@Tx[0],@Tx[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1 +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshr_u32	(@Tx[0],@Tx[1],30); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshl_u32	(@Tx[1],@Tx[1],2); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@X[0],@X[0],@Tx[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2 + +	foreach (@insns) { eval; }	# remaining instructions [if any] + +  $Xi++;	push(@X,shift(@X));	# "rotate" X[] +} + +sub Xupdate_32_79 () +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e); + +	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vadd_i32	(@Tx[1],@X[-1&7],$K); +	 eval(shift(@insns)); +	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0); +	 eval(shift(@insns)); +	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]" +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshr_u32	(@X[0],@Tx[0],30); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer +	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2 + +	foreach (@insns) { eval; }	# remaining instructions [if any] + +  $Xi++;	push(@X,shift(@X));	# "rotate" X[] +} + +sub Xuplast_80 () +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e); + +	&vadd_i32	(@Tx[1],@X[-1&7],$K); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!"); +	&sub		($Xfer,$Xfer,64); + +	&teq		($inp,$len); +	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX +	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV +	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19 +	 
eval(shift(@insns)); +	 eval(shift(@insns)); +	&vrev32_8	(@X[-4&7],@X[-4&7]); + +	foreach (@insns) { eval; }		# remaining instructions + +   $Xi=0; +} + +sub Xloop() +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e); + +	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU + +	foreach (@insns) { eval; } + +  $Xi++; +} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	sha1_block_data_order_neon,%function +.align	4 +sha1_block_data_order_neon: +.LNEON: +	stmdb	sp!,{r4-r12,lr} +	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp +	@ dmb				@ errata #451034 on early Cortex A8 +	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so +	mov	$saved_sp,sp +	sub	sp,sp,#64		@ alloca +	adr	$K_XX_XX,.LK_00_19 +	bic	sp,sp,#15		@ align for 128-bit stores + +	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context +	mov	$Xfer,sp + +	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned +	veor		$zero,$zero,$zero +	vld1.8		{@X[-2&7]-@X[-1&7]},[$inp]! +	vld1.32		{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19 +	vrev32.8	@X[-4&7],@X[-4&7]		@ yes, even on +	vrev32.8	@X[-3&7],@X[-3&7]		@ big-endian... +	vrev32.8	@X[-2&7],@X[-2&7] +	vadd.i32	@X[0],@X[-4&7],$K +	vrev32.8	@X[-1&7],@X[-1&7] +	vadd.i32	@X[1],@X[-3&7],$K +	vst1.32		{@X[0]},[$Xfer,:128]! +	vadd.i32	@X[2],@X[-2&7],$K +	vst1.32		{@X[1]},[$Xfer,:128]! +	vst1.32		{@X[2]},[$Xfer,:128]! +	ldr		$Ki,[sp]			@ big RAW stall + +.Loop_neon: +___ +	&Xupdate_16_31(\&body_00_19); +	&Xupdate_16_31(\&body_00_19); +	&Xupdate_16_31(\&body_00_19); +	&Xupdate_16_31(\&body_00_19); +	&Xupdate_32_79(\&body_00_19); +	&Xupdate_32_79(\&body_20_39); +	&Xupdate_32_79(\&body_20_39); +	&Xupdate_32_79(\&body_20_39); +	&Xupdate_32_79(\&body_20_39); +	&Xupdate_32_79(\&body_20_39); +	&Xupdate_32_79(\&body_40_59); +	&Xupdate_32_79(\&body_40_59); +	&Xupdate_32_79(\&body_40_59); +	&Xupdate_32_79(\&body_40_59); +	&Xupdate_32_79(\&body_40_59); +	&Xupdate_32_79(\&body_20_39); +	&Xuplast_80(\&body_20_39); +	&Xloop(\&body_20_39); +	&Xloop(\&body_20_39); +	&Xloop(\&body_20_39); +$code.=<<___; +	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context +	add	$a,$a,$Ki +	ldr	$Ki,[$ctx,#16] +	add	$b,$b,$t0 +	add	$c,$c,$t1 +	add	$d,$d,$Xfer +	moveq	sp,$saved_sp +	add	$e,$e,$Ki +	ldrne	$Ki,[sp] +	stmia	$ctx,{$a,$b,$c,$d,$e} +	addne	$Xfer,sp,#3*16 +	bne	.Loop_neon + +	@ vldmia	sp!,{d8-d15} +	ldmia	sp!,{r4-r12,pc} +.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +___ +}}} +##################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3)); +my @MSG=map("q$_",(4..7)); +my @Kxx=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.type	sha1_block_data_order_armv8,%function +.align	5 +sha1_block_data_order_armv8: +.LARMv8: +	vstmdb	sp!,{d8-d15}		@ ABI specification says so + +	veor	$E,$E,$E +	adr	r3,.LK_00_19 +	vld1.32	{$ABCD},[$ctx]! +	vld1.32	{$E\[0]},[$ctx] +	sub	$ctx,$ctx,#16 +	vld1.32	{@Kxx[0]\[]},[r3,:32]! +	vld1.32	{@Kxx[1]\[]},[r3,:32]! +	vld1.32	{@Kxx[2]\[]},[r3,:32]! +	vld1.32	{@Kxx[3]\[]},[r3,:32] + +.Loop_v8: +	vld1.8		{@MSG[0]-@MSG[1]},[$inp]! +	vld1.8		{@MSG[2]-@MSG[3]},[$inp]! 
+	vrev32.8	@MSG[0],@MSG[0] +	vrev32.8	@MSG[1],@MSG[1] + +	vadd.i32	$W0,@Kxx[0],@MSG[0] +	vrev32.8	@MSG[2],@MSG[2] +	vmov		$ABCD_SAVE,$ABCD	@ offload +	subs		$len,$len,#1 + +	vadd.i32	$W1,@Kxx[0],@MSG[1] +	vrev32.8	@MSG[3],@MSG[3] +	sha1h		$E1,$ABCD		@ 0 +	sha1c		$ABCD,$E,$W0 +	vadd.i32	$W0,@Kxx[$j],@MSG[2] +	sha1su0		@MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; +	sha1h		$E0,$ABCD		@ $i +	sha1$f		$ABCD,$E1,$W1 +	vadd.i32	$W1,@Kxx[$j],@MSG[3] +	sha1su1		@MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); +	sha1su0		@MSG[1],@MSG[2],@MSG[3]  ___ +	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0); +	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0); +} +$code.=<<___; +	sha1h		$E0,$ABCD		@ $i +	sha1p		$ABCD,$E1,$W1 +	vadd.i32	$W1,@Kxx[$j],@MSG[3] + +	sha1h		$E1,$ABCD		@ 18 +	sha1p		$ABCD,$E0,$W0 + +	sha1h		$E0,$ABCD		@ 19 +	sha1p		$ABCD,$E1,$W1 + +	vadd.i32	$E,$E,$E0 +	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE +	bne		.Loop_v8 + +	vst1.32		{$ABCD},[$ctx]! +	vst1.32		{$E\[0]},[$ctx] + +	vldmia	sp!,{d8-d15} +	ret					@ bx lr +.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.comm	OPENSSL_armcap_P,4,4 +___ + +{   my  %opcode = ( +	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40, +	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40, +	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	); + +    sub unsha1 { +	my ($mnemonic,$arg)=@_; + +	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { +	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) +					 |(($2&7)<<17)|(($2&8)<<4) +					 |(($3&7)<<1) |(($3&8)<<2); +	    # since ARMv7 instructions are always encoded little-endian. +	    # correct solution is to use .inst directive, but older +	    # assemblers don't implement it:-( +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", +			$word&0xff,($word>>8)&0xff, +			($word>>16)&0xff,($word>>24)&0xff, +			$mnemonic,$arg; +	} +    } +} + +foreach (split($/,$code)) { +	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or +	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo; + +	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo; + +	s/\bret\b/bx	lr/o		or +	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4 + +	print $_,$/; +} -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 -print $code;  close STDOUT; # enforce flush diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.s b/main/openssl/crypto/sha/asm/sha1-armv4-large.s index 639ae78a..a1562883 100644 --- a/main/openssl/crypto/sha/asm/sha1-armv4-large.s +++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.s @@ -1,12 +1,22 @@  #include "arm_arch.h"  .text +.code	32  .global	sha1_block_data_order  .type	sha1_block_data_order,%function -.align	2 +.align	5  sha1_block_data_order: +#if __ARM_ARCH__>=7 +	sub	r3,pc,#8		@ sha1_block_data_order +	ldr	r12,.LOPENSSL_armcap +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P +	tst	r12,#ARMV8_SHA1 +	bne	.LARMv8 +	tst	r12,#ARMV7_NEON +	bne	.LNEON +#endif  	stmdb	sp!,{r4-r12,lr}  	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1  	ldmia	r0,{r3,r4,r5,r6,r7} @@ -442,11 +452,999 @@ sha1_block_data_order:  	moveq	pc,lr			@ be binary compatible with V4, yet  	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)  #endif -.align	2 +.size	sha1_block_data_order,.-sha1_block_data_order + +.align	5  .LK_00_19:	.word	0x5a827999  .LK_20_39:	.word	0x6ed9eba1  .LK_40_59:	.word	0x8f1bbcdc  .LK_60_79:	.word	0xca62c1d6 -.size	sha1_block_data_order,.-sha1_block_data_order -.asciz	
"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align	2 +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-sha1_block_data_order +.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align	5 +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	sha1_block_data_order_neon,%function +.align	4 +sha1_block_data_order_neon: +.LNEON: +	stmdb	sp!,{r4-r12,lr} +	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1 +	@ dmb				@ errata #451034 on early Cortex A8 +	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so +	mov	r14,sp +	sub	sp,sp,#64		@ alloca +	adr	r8,.LK_00_19 +	bic	sp,sp,#15		@ align for 128-bit stores + +	ldmia	r0,{r3,r4,r5,r6,r7}	@ load context +	mov	r12,sp + +	vld1.8		{q0-q1},[r1]!	@ handles unaligned +	veor		q15,q15,q15 +	vld1.8		{q2-q3},[r1]! +	vld1.32		{d28[],d29[]},[r8,:32]!	@ load K_00_19 +	vrev32.8	q0,q0		@ yes, even on +	vrev32.8	q1,q1		@ big-endian... +	vrev32.8	q2,q2 +	vadd.i32	q8,q0,q14 +	vrev32.8	q3,q3 +	vadd.i32	q9,q1,q14 +	vst1.32		{q8},[r12,:128]! +	vadd.i32	q10,q2,q14 +	vst1.32		{q9},[r12,:128]! +	vst1.32		{q10},[r12,:128]! +	ldr		r9,[sp]			@ big RAW stall + +.Loop_neon: +	vext.8	q8,q0,q1,#8 +	bic	r10,r6,r4 +	add	r7,r7,r9 +	and	r11,r5,r4 +	vadd.i32	q13,q3,q14 +	ldr	r9,[sp,#4] +	add	r7,r7,r3,ror#27 +	vext.8	q12,q3,q15,#4 +	eor	r11,r11,r10 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	veor	q8,q8,q0 +	bic	r10,r5,r3 +	add	r6,r6,r9 +	veor	q12,q12,q2 +	and	r11,r4,r3 +	ldr	r9,[sp,#8] +	veor	q12,q12,q8 +	add	r6,r6,r7,ror#27 +	eor	r11,r11,r10 +	vst1.32	{q13},[r12,:128]! +	sub	r12,r12,#64 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vext.8	q13,q15,q12,#4 +	bic	r10,r4,r7 +	add	r5,r5,r9 +	vadd.i32	q8,q12,q12 +	and	r11,r3,r7 +	ldr	r9,[sp,#12] +	vsri.32	q8,q12,#31 +	add	r5,r5,r6,ror#27 +	eor	r11,r11,r10 +	mov	r7,r7,ror#2 +	vshr.u32	q12,q13,#30 +	add	r5,r5,r11 +	bic	r10,r3,r6 +	vshl.u32	q13,q13,#2 +	add	r4,r4,r9 +	and	r11,r7,r6 +	veor	q8,q8,q12 +	ldr	r9,[sp,#16] +	add	r4,r4,r5,ror#27 +	veor	q8,q8,q13 +	eor	r11,r11,r10 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vext.8	q9,q1,q2,#8 +	bic	r10,r7,r5 +	add	r3,r3,r9 +	and	r11,r6,r5 +	vadd.i32	q13,q8,q14 +	ldr	r9,[sp,#20] +	vld1.32	{d28[],d29[]},[r8,:32]! +	add	r3,r3,r4,ror#27 +	vext.8	q12,q8,q15,#4 +	eor	r11,r11,r10 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	veor	q9,q9,q1 +	bic	r10,r6,r4 +	add	r7,r7,r9 +	veor	q12,q12,q3 +	and	r11,r5,r4 +	ldr	r9,[sp,#24] +	veor	q12,q12,q9 +	add	r7,r7,r3,ror#27 +	eor	r11,r11,r10 +	vst1.32	{q13},[r12,:128]! +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vext.8	q13,q15,q12,#4 +	bic	r10,r5,r3 +	add	r6,r6,r9 +	vadd.i32	q9,q12,q12 +	and	r11,r4,r3 +	ldr	r9,[sp,#28] +	vsri.32	q9,q12,#31 +	add	r6,r6,r7,ror#27 +	eor	r11,r11,r10 +	mov	r3,r3,ror#2 +	vshr.u32	q12,q13,#30 +	add	r6,r6,r11 +	bic	r10,r4,r7 +	vshl.u32	q13,q13,#2 +	add	r5,r5,r9 +	and	r11,r3,r7 +	veor	q9,q9,q12 +	ldr	r9,[sp,#32] +	add	r5,r5,r6,ror#27 +	veor	q9,q9,q13 +	eor	r11,r11,r10 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	vext.8	q10,q2,q3,#8 +	bic	r10,r3,r6 +	add	r4,r4,r9 +	and	r11,r7,r6 +	vadd.i32	q13,q9,q14 +	ldr	r9,[sp,#36] +	add	r4,r4,r5,ror#27 +	vext.8	q12,q9,q15,#4 +	eor	r11,r11,r10 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	veor	q10,q10,q2 +	bic	r10,r7,r5 +	add	r3,r3,r9 +	veor	q12,q12,q8 +	and	r11,r6,r5 +	ldr	r9,[sp,#40] +	veor	q12,q12,q10 +	add	r3,r3,r4,ror#27 +	eor	r11,r11,r10 +	vst1.32	{q13},[r12,:128]! 
+	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	vext.8	q13,q15,q12,#4 +	bic	r10,r6,r4 +	add	r7,r7,r9 +	vadd.i32	q10,q12,q12 +	and	r11,r5,r4 +	ldr	r9,[sp,#44] +	vsri.32	q10,q12,#31 +	add	r7,r7,r3,ror#27 +	eor	r11,r11,r10 +	mov	r4,r4,ror#2 +	vshr.u32	q12,q13,#30 +	add	r7,r7,r11 +	bic	r10,r5,r3 +	vshl.u32	q13,q13,#2 +	add	r6,r6,r9 +	and	r11,r4,r3 +	veor	q10,q10,q12 +	ldr	r9,[sp,#48] +	add	r6,r6,r7,ror#27 +	veor	q10,q10,q13 +	eor	r11,r11,r10 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vext.8	q11,q3,q8,#8 +	bic	r10,r4,r7 +	add	r5,r5,r9 +	and	r11,r3,r7 +	vadd.i32	q13,q10,q14 +	ldr	r9,[sp,#52] +	add	r5,r5,r6,ror#27 +	vext.8	q12,q10,q15,#4 +	eor	r11,r11,r10 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	veor	q11,q11,q3 +	bic	r10,r3,r6 +	add	r4,r4,r9 +	veor	q12,q12,q9 +	and	r11,r7,r6 +	ldr	r9,[sp,#56] +	veor	q12,q12,q11 +	add	r4,r4,r5,ror#27 +	eor	r11,r11,r10 +	vst1.32	{q13},[r12,:128]! +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vext.8	q13,q15,q12,#4 +	bic	r10,r7,r5 +	add	r3,r3,r9 +	vadd.i32	q11,q12,q12 +	and	r11,r6,r5 +	ldr	r9,[sp,#60] +	vsri.32	q11,q12,#31 +	add	r3,r3,r4,ror#27 +	eor	r11,r11,r10 +	mov	r5,r5,ror#2 +	vshr.u32	q12,q13,#30 +	add	r3,r3,r11 +	bic	r10,r6,r4 +	vshl.u32	q13,q13,#2 +	add	r7,r7,r9 +	and	r11,r5,r4 +	veor	q11,q11,q12 +	ldr	r9,[sp,#0] +	add	r7,r7,r3,ror#27 +	veor	q11,q11,q13 +	eor	r11,r11,r10 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vext.8	q12,q10,q11,#8 +	bic	r10,r5,r3 +	add	r6,r6,r9 +	and	r11,r4,r3 +	veor	q0,q0,q8 +	ldr	r9,[sp,#4] +	add	r6,r6,r7,ror#27 +	veor	q0,q0,q1 +	eor	r11,r11,r10 +	mov	r3,r3,ror#2 +	vadd.i32	q13,q11,q14 +	add	r6,r6,r11 +	bic	r10,r4,r7 +	veor	q12,q12,q0 +	add	r5,r5,r9 +	and	r11,r3,r7 +	vshr.u32	q0,q12,#30 +	ldr	r9,[sp,#8] +	add	r5,r5,r6,ror#27 +	vst1.32	{q13},[r12,:128]! +	sub	r12,r12,#64 +	eor	r11,r11,r10 +	mov	r7,r7,ror#2 +	vsli.32	q0,q12,#2 +	add	r5,r5,r11 +	bic	r10,r3,r6 +	add	r4,r4,r9 +	and	r11,r7,r6 +	ldr	r9,[sp,#12] +	add	r4,r4,r5,ror#27 +	eor	r11,r11,r10 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	bic	r10,r7,r5 +	add	r3,r3,r9 +	and	r11,r6,r5 +	ldr	r9,[sp,#16] +	add	r3,r3,r4,ror#27 +	eor	r11,r11,r10 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	vext.8	q12,q11,q0,#8 +	eor	r10,r4,r6 +	add	r7,r7,r9 +	ldr	r9,[sp,#20] +	veor	q1,q1,q9 +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	veor	q1,q1,q2 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vadd.i32	q13,q0,q14 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	veor	q12,q12,q1 +	ldr	r9,[sp,#24] +	eor	r11,r10,r4 +	vshr.u32	q1,q12,#30 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	vst1.32	{q13},[r12,:128]! +	add	r6,r6,r11 +	eor	r10,r7,r4 +	vsli.32	q1,q12,#2 +	add	r5,r5,r9 +	ldr	r9,[sp,#28] +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	ldr	r9,[sp,#32] +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vext.8	q12,q0,q1,#8 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	ldr	r9,[sp,#36] +	veor	q2,q2,q10 +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	veor	q2,q2,q3 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	vadd.i32	q13,q1,q14 +	eor	r10,r4,r6 +	vld1.32	{d28[],d29[]},[r8,:32]! +	add	r7,r7,r9 +	veor	q12,q12,q2 +	ldr	r9,[sp,#40] +	eor	r11,r10,r5 +	vshr.u32	q2,q12,#30 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	vst1.32	{q13},[r12,:128]! 
+	add	r7,r7,r11 +	eor	r10,r3,r5 +	vsli.32	q2,q12,#2 +	add	r6,r6,r9 +	ldr	r9,[sp,#44] +	eor	r11,r10,r4 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	ldr	r9,[sp,#48] +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	vext.8	q12,q1,q2,#8 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	ldr	r9,[sp,#52] +	veor	q3,q3,q11 +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	veor	q3,q3,q8 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vadd.i32	q13,q2,q14 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	veor	q12,q12,q3 +	ldr	r9,[sp,#56] +	eor	r11,r10,r6 +	vshr.u32	q3,q12,#30 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	vst1.32	{q13},[r12,:128]! +	add	r3,r3,r11 +	eor	r10,r4,r6 +	vsli.32	q3,q12,#2 +	add	r7,r7,r9 +	ldr	r9,[sp,#60] +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	ldr	r9,[sp,#0] +	eor	r11,r10,r4 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vext.8	q12,q2,q3,#8 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	ldr	r9,[sp,#4] +	veor	q8,q8,q0 +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	veor	q8,q8,q9 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	vadd.i32	q13,q3,q14 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	veor	q12,q12,q8 +	ldr	r9,[sp,#8] +	eor	r11,r10,r7 +	vshr.u32	q8,q12,#30 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	vst1.32	{q13},[r12,:128]! +	sub	r12,r12,#64 +	add	r4,r4,r11 +	eor	r10,r5,r7 +	vsli.32	q8,q12,#2 +	add	r3,r3,r9 +	ldr	r9,[sp,#12] +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	eor	r10,r4,r6 +	add	r7,r7,r9 +	ldr	r9,[sp,#16] +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vext.8	q12,q3,q8,#8 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	ldr	r9,[sp,#20] +	veor	q9,q9,q1 +	eor	r11,r10,r4 +	add	r6,r6,r7,ror#27 +	veor	q9,q9,q10 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vadd.i32	q13,q8,q14 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	veor	q12,q12,q9 +	ldr	r9,[sp,#24] +	eor	r11,r10,r3 +	vshr.u32	q9,q12,#30 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	vst1.32	{q13},[r12,:128]! +	add	r5,r5,r11 +	eor	r10,r6,r3 +	vsli.32	q9,q12,#2 +	add	r4,r4,r9 +	ldr	r9,[sp,#28] +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	ldr	r9,[sp,#32] +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	vext.8	q12,q8,q9,#8 +	add	r7,r7,r9 +	and	r10,r5,r6 +	ldr	r9,[sp,#36] +	veor	q10,q10,q2 +	add	r7,r7,r3,ror#27 +	eor	r11,r5,r6 +	veor	q10,q10,q11 +	add	r7,r7,r10 +	and	r11,r11,r4 +	vadd.i32	q13,q9,q14 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	veor	q12,q12,q10 +	add	r6,r6,r9 +	and	r10,r4,r5 +	vshr.u32	q10,q12,#30 +	ldr	r9,[sp,#40] +	add	r6,r6,r7,ror#27 +	vst1.32	{q13},[r12,:128]! +	eor	r11,r4,r5 +	add	r6,r6,r10 +	vsli.32	q10,q12,#2 +	and	r11,r11,r3 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	add	r5,r5,r9 +	and	r10,r3,r4 +	ldr	r9,[sp,#44] +	add	r5,r5,r6,ror#27 +	eor	r11,r3,r4 +	add	r5,r5,r10 +	and	r11,r11,r7 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	add	r4,r4,r9 +	and	r10,r7,r3 +	ldr	r9,[sp,#48] +	add	r4,r4,r5,ror#27 +	eor	r11,r7,r3 +	add	r4,r4,r10 +	and	r11,r11,r6 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vext.8	q12,q9,q10,#8 +	add	r3,r3,r9 +	and	r10,r6,r7 +	ldr	r9,[sp,#52] +	veor	q11,q11,q3 +	add	r3,r3,r4,ror#27 +	eor	r11,r6,r7 +	veor	q11,q11,q0 +	add	r3,r3,r10 +	and	r11,r11,r5 +	vadd.i32	q13,q10,q14 +	mov	r5,r5,ror#2 +	vld1.32	{d28[],d29[]},[r8,:32]! +	add	r3,r3,r11 +	veor	q12,q12,q11 +	add	r7,r7,r9 +	and	r10,r5,r6 +	vshr.u32	q11,q12,#30 +	ldr	r9,[sp,#56] +	add	r7,r7,r3,ror#27 +	vst1.32	{q13},[r12,:128]! 
+	eor	r11,r5,r6 +	add	r7,r7,r10 +	vsli.32	q11,q12,#2 +	and	r11,r11,r4 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	add	r6,r6,r9 +	and	r10,r4,r5 +	ldr	r9,[sp,#60] +	add	r6,r6,r7,ror#27 +	eor	r11,r4,r5 +	add	r6,r6,r10 +	and	r11,r11,r3 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	add	r5,r5,r9 +	and	r10,r3,r4 +	ldr	r9,[sp,#0] +	add	r5,r5,r6,ror#27 +	eor	r11,r3,r4 +	add	r5,r5,r10 +	and	r11,r11,r7 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	vext.8	q12,q10,q11,#8 +	add	r4,r4,r9 +	and	r10,r7,r3 +	ldr	r9,[sp,#4] +	veor	q0,q0,q8 +	add	r4,r4,r5,ror#27 +	eor	r11,r7,r3 +	veor	q0,q0,q1 +	add	r4,r4,r10 +	and	r11,r11,r6 +	vadd.i32	q13,q11,q14 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	veor	q12,q12,q0 +	add	r3,r3,r9 +	and	r10,r6,r7 +	vshr.u32	q0,q12,#30 +	ldr	r9,[sp,#8] +	add	r3,r3,r4,ror#27 +	vst1.32	{q13},[r12,:128]! +	sub	r12,r12,#64 +	eor	r11,r6,r7 +	add	r3,r3,r10 +	vsli.32	q0,q12,#2 +	and	r11,r11,r5 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	add	r7,r7,r9 +	and	r10,r5,r6 +	ldr	r9,[sp,#12] +	add	r7,r7,r3,ror#27 +	eor	r11,r5,r6 +	add	r7,r7,r10 +	and	r11,r11,r4 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	add	r6,r6,r9 +	and	r10,r4,r5 +	ldr	r9,[sp,#16] +	add	r6,r6,r7,ror#27 +	eor	r11,r4,r5 +	add	r6,r6,r10 +	and	r11,r11,r3 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vext.8	q12,q11,q0,#8 +	add	r5,r5,r9 +	and	r10,r3,r4 +	ldr	r9,[sp,#20] +	veor	q1,q1,q9 +	add	r5,r5,r6,ror#27 +	eor	r11,r3,r4 +	veor	q1,q1,q2 +	add	r5,r5,r10 +	and	r11,r11,r7 +	vadd.i32	q13,q0,q14 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	veor	q12,q12,q1 +	add	r4,r4,r9 +	and	r10,r7,r3 +	vshr.u32	q1,q12,#30 +	ldr	r9,[sp,#24] +	add	r4,r4,r5,ror#27 +	vst1.32	{q13},[r12,:128]! +	eor	r11,r7,r3 +	add	r4,r4,r10 +	vsli.32	q1,q12,#2 +	and	r11,r11,r6 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	add	r3,r3,r9 +	and	r10,r6,r7 +	ldr	r9,[sp,#28] +	add	r3,r3,r4,ror#27 +	eor	r11,r6,r7 +	add	r3,r3,r10 +	and	r11,r11,r5 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	add	r7,r7,r9 +	and	r10,r5,r6 +	ldr	r9,[sp,#32] +	add	r7,r7,r3,ror#27 +	eor	r11,r5,r6 +	add	r7,r7,r10 +	and	r11,r11,r4 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vext.8	q12,q0,q1,#8 +	add	r6,r6,r9 +	and	r10,r4,r5 +	ldr	r9,[sp,#36] +	veor	q2,q2,q10 +	add	r6,r6,r7,ror#27 +	eor	r11,r4,r5 +	veor	q2,q2,q3 +	add	r6,r6,r10 +	and	r11,r11,r3 +	vadd.i32	q13,q1,q14 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	veor	q12,q12,q2 +	add	r5,r5,r9 +	and	r10,r3,r4 +	vshr.u32	q2,q12,#30 +	ldr	r9,[sp,#40] +	add	r5,r5,r6,ror#27 +	vst1.32	{q13},[r12,:128]! +	eor	r11,r3,r4 +	add	r5,r5,r10 +	vsli.32	q2,q12,#2 +	and	r11,r11,r7 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	add	r4,r4,r9 +	and	r10,r7,r3 +	ldr	r9,[sp,#44] +	add	r4,r4,r5,ror#27 +	eor	r11,r7,r3 +	add	r4,r4,r10 +	and	r11,r11,r6 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	add	r3,r3,r9 +	and	r10,r6,r7 +	ldr	r9,[sp,#48] +	add	r3,r3,r4,ror#27 +	eor	r11,r6,r7 +	add	r3,r3,r10 +	and	r11,r11,r5 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	vext.8	q12,q1,q2,#8 +	eor	r10,r4,r6 +	add	r7,r7,r9 +	ldr	r9,[sp,#52] +	veor	q3,q3,q11 +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	veor	q3,q3,q8 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vadd.i32	q13,q2,q14 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	veor	q12,q12,q3 +	ldr	r9,[sp,#56] +	eor	r11,r10,r4 +	vshr.u32	q3,q12,#30 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	vst1.32	{q13},[r12,:128]! +	add	r6,r6,r11 +	eor	r10,r7,r4 +	vsli.32	q3,q12,#2 +	add	r5,r5,r9 +	ldr	r9,[sp,#60] +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	ldr	r9,[sp,#0] +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	vadd.i32	q13,q3,q14 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	vst1.32	{q13},[r12,:128]! 
+	sub	r12,r12,#64 +	teq	r1,r2 +	sub	r8,r8,#16 +	subeq	r1,r1,#64 +	vld1.8	{q0-q1},[r1]! +	ldr	r9,[sp,#4] +	eor	r11,r10,r6 +	vld1.8	{q2-q3},[r1]! +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	vld1.32	{d28[],d29[]},[r8,:32]! +	add	r3,r3,r11 +	eor	r10,r4,r6 +	vrev32.8	q0,q0 +	add	r7,r7,r9 +	ldr	r9,[sp,#8] +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	ldr	r9,[sp,#12] +	eor	r11,r10,r4 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	ldr	r9,[sp,#16] +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	vrev32.8	q1,q1 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	vadd.i32	q8,q0,q14 +	ldr	r9,[sp,#20] +	eor	r11,r10,r7 +	vst1.32	{q8},[r12,:128]! +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	ldr	r9,[sp,#24] +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	eor	r10,r4,r6 +	add	r7,r7,r9 +	ldr	r9,[sp,#28] +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	ldr	r9,[sp,#32] +	eor	r11,r10,r4 +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	vrev32.8	q2,q2 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	vadd.i32	q9,q1,q14 +	ldr	r9,[sp,#36] +	eor	r11,r10,r3 +	vst1.32	{q9},[r12,:128]! +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	ldr	r9,[sp,#40] +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	ldr	r9,[sp,#44] +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	eor	r10,r4,r6 +	add	r7,r7,r9 +	ldr	r9,[sp,#48] +	eor	r11,r10,r5 +	add	r7,r7,r3,ror#27 +	mov	r4,r4,ror#2 +	add	r7,r7,r11 +	vrev32.8	q3,q3 +	eor	r10,r3,r5 +	add	r6,r6,r9 +	vadd.i32	q10,q2,q14 +	ldr	r9,[sp,#52] +	eor	r11,r10,r4 +	vst1.32	{q10},[r12,:128]! +	add	r6,r6,r7,ror#27 +	mov	r3,r3,ror#2 +	add	r6,r6,r11 +	eor	r10,r7,r4 +	add	r5,r5,r9 +	ldr	r9,[sp,#56] +	eor	r11,r10,r3 +	add	r5,r5,r6,ror#27 +	mov	r7,r7,ror#2 +	add	r5,r5,r11 +	eor	r10,r6,r3 +	add	r4,r4,r9 +	ldr	r9,[sp,#60] +	eor	r11,r10,r7 +	add	r4,r4,r5,ror#27 +	mov	r6,r6,ror#2 +	add	r4,r4,r11 +	eor	r10,r5,r7 +	add	r3,r3,r9 +	eor	r11,r10,r6 +	add	r3,r3,r4,ror#27 +	mov	r5,r5,ror#2 +	add	r3,r3,r11 +	ldmia	r0,{r9,r10,r11,r12}	@ accumulate context +	add	r3,r3,r9 +	ldr	r9,[r0,#16] +	add	r4,r4,r10 +	add	r5,r5,r11 +	add	r6,r6,r12 +	moveq	sp,r14 +	add	r7,r7,r9 +	ldrne	r9,[sp] +	stmia	r0,{r3,r4,r5,r6,r7} +	addne	r12,sp,#3*16 +	bne	.Loop_neon + +	@ vldmia	sp!,{d8-d15} +	ldmia	sp!,{r4-r12,pc} +.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type	sha1_block_data_order_armv8,%function +.align	5 +sha1_block_data_order_armv8: +.LARMv8: +	vstmdb	sp!,{d8-d15}		@ ABI specification says so + +	veor	q1,q1,q1 +	adr	r3,.LK_00_19 +	vld1.32	{q0},[r0]! +	vld1.32	{d2[0]},[r0] +	sub	r0,r0,#16 +	vld1.32	{d16[],d17[]},[r3,:32]! +	vld1.32	{d18[],d19[]},[r3,:32]! +	vld1.32	{d20[],d21[]},[r3,:32]! +	vld1.32	{d22[],d23[]},[r3,:32] + +.Loop_v8: +	vld1.8		{q4-q5},[r1]! +	vld1.8		{q6-q7},[r1]! 
+	vrev32.8	q4,q4 +	vrev32.8	q5,q5 + +	vadd.i32	q12,q8,q4 +	vrev32.8	q6,q6 +	vmov		q14,q0	@ offload +	subs		r2,r2,#1 + +	vadd.i32	q13,q8,q5 +	vrev32.8	q7,q7 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 0 +	.byte	0x68,0x0c,0x02,0xf2	@ sha1c q0,q1,q12 +	vadd.i32	q12,q8,q6 +	.byte	0x4c,0x8c,0x3a,0xf2	@ sha1su0 q4,q5,q6 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 1 +	.byte	0x6a,0x0c,0x06,0xf2	@ sha1c q0,q3,q13 +	vadd.i32	q13,q8,q7 +	.byte	0x8e,0x83,0xba,0xf3	@ sha1su1 q4,q7 +	.byte	0x4e,0xac,0x3c,0xf2	@ sha1su0 q5,q6,q7 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 2 +	.byte	0x68,0x0c,0x04,0xf2	@ sha1c q0,q2,q12 +	vadd.i32	q12,q8,q4 +	.byte	0x88,0xa3,0xba,0xf3	@ sha1su1 q5,q4 +	.byte	0x48,0xcc,0x3e,0xf2	@ sha1su0 q6,q7,q4 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 3 +	.byte	0x6a,0x0c,0x06,0xf2	@ sha1c q0,q3,q13 +	vadd.i32	q13,q9,q5 +	.byte	0x8a,0xc3,0xba,0xf3	@ sha1su1 q6,q5 +	.byte	0x4a,0xec,0x38,0xf2	@ sha1su0 q7,q4,q5 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 4 +	.byte	0x68,0x0c,0x04,0xf2	@ sha1c q0,q2,q12 +	vadd.i32	q12,q9,q6 +	.byte	0x8c,0xe3,0xba,0xf3	@ sha1su1 q7,q6 +	.byte	0x4c,0x8c,0x3a,0xf2	@ sha1su0 q4,q5,q6 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 5 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 +	vadd.i32	q13,q9,q7 +	.byte	0x8e,0x83,0xba,0xf3	@ sha1su1 q4,q7 +	.byte	0x4e,0xac,0x3c,0xf2	@ sha1su0 q5,q6,q7 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 6 +	.byte	0x68,0x0c,0x14,0xf2	@ sha1p q0,q2,q12 +	vadd.i32	q12,q9,q4 +	.byte	0x88,0xa3,0xba,0xf3	@ sha1su1 q5,q4 +	.byte	0x48,0xcc,0x3e,0xf2	@ sha1su0 q6,q7,q4 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 7 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 +	vadd.i32	q13,q9,q5 +	.byte	0x8a,0xc3,0xba,0xf3	@ sha1su1 q6,q5 +	.byte	0x4a,0xec,0x38,0xf2	@ sha1su0 q7,q4,q5 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 8 +	.byte	0x68,0x0c,0x14,0xf2	@ sha1p q0,q2,q12 +	vadd.i32	q12,q10,q6 +	.byte	0x8c,0xe3,0xba,0xf3	@ sha1su1 q7,q6 +	.byte	0x4c,0x8c,0x3a,0xf2	@ sha1su0 q4,q5,q6 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 9 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 +	vadd.i32	q13,q10,q7 +	.byte	0x8e,0x83,0xba,0xf3	@ sha1su1 q4,q7 +	.byte	0x4e,0xac,0x3c,0xf2	@ sha1su0 q5,q6,q7 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 10 +	.byte	0x68,0x0c,0x24,0xf2	@ sha1m q0,q2,q12 +	vadd.i32	q12,q10,q4 +	.byte	0x88,0xa3,0xba,0xf3	@ sha1su1 q5,q4 +	.byte	0x48,0xcc,0x3e,0xf2	@ sha1su0 q6,q7,q4 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 11 +	.byte	0x6a,0x0c,0x26,0xf2	@ sha1m q0,q3,q13 +	vadd.i32	q13,q10,q5 +	.byte	0x8a,0xc3,0xba,0xf3	@ sha1su1 q6,q5 +	.byte	0x4a,0xec,0x38,0xf2	@ sha1su0 q7,q4,q5 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 12 +	.byte	0x68,0x0c,0x24,0xf2	@ sha1m q0,q2,q12 +	vadd.i32	q12,q10,q6 +	.byte	0x8c,0xe3,0xba,0xf3	@ sha1su1 q7,q6 +	.byte	0x4c,0x8c,0x3a,0xf2	@ sha1su0 q4,q5,q6 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 13 +	.byte	0x6a,0x0c,0x26,0xf2	@ sha1m q0,q3,q13 +	vadd.i32	q13,q11,q7 +	.byte	0x8e,0x83,0xba,0xf3	@ sha1su1 q4,q7 +	.byte	0x4e,0xac,0x3c,0xf2	@ sha1su0 q5,q6,q7 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 14 +	.byte	0x68,0x0c,0x24,0xf2	@ sha1m q0,q2,q12 +	vadd.i32	q12,q11,q4 +	.byte	0x88,0xa3,0xba,0xf3	@ sha1su1 q5,q4 +	.byte	0x48,0xcc,0x3e,0xf2	@ sha1su0 q6,q7,q4 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 15 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 +	vadd.i32	q13,q11,q5 +	.byte	0x8a,0xc3,0xba,0xf3	@ sha1su1 q6,q5 +	.byte	0x4a,0xec,0x38,0xf2	@ sha1su0 q7,q4,q5 +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 16 +	.byte	0x68,0x0c,0x14,0xf2	@ sha1p q0,q2,q12 +	vadd.i32	q12,q11,q6 +	.byte	0x8c,0xe3,0xba,0xf3	@ 
sha1su1 q7,q6 +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 17 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 +	vadd.i32	q13,q11,q7 + +	.byte	0xc0,0x62,0xb9,0xf3	@ sha1h q3,q0		@ 18 +	.byte	0x68,0x0c,0x14,0xf2	@ sha1p q0,q2,q12 + +	.byte	0xc0,0x42,0xb9,0xf3	@ sha1h q2,q0		@ 19 +	.byte	0x6a,0x0c,0x16,0xf2	@ sha1p q0,q3,q13 + +	vadd.i32	q1,q1,q2 +	vadd.i32	q0,q0,q14 +	bne		.Loop_v8 + +	vst1.32		{q0},[r0]! +	vst1.32		{d2[0]},[r0] + +	vldmia	sp!,{d8-d15} +	bx	lr					@ bx lr +.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +.comm	OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha1-armv8.S b/main/openssl/crypto/sha/asm/sha1-armv8.S new file mode 100644 index 00000000..f9d12625 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha1-armv8.S @@ -0,0 +1,1211 @@ +#include "arm_arch.h" + +.text + +.globl	sha1_block_data_order +.type	sha1_block_data_order,%function +.align	6 +sha1_block_data_order: +	ldr	x16,.LOPENSSL_armcap_P +	adr	x17,.LOPENSSL_armcap_P +	add	x16,x16,x17 +	ldr	w16,[x16] +	tst	w16,#ARMV8_SHA1 +	b.ne	.Lv8_entry + +	stp	x29,x30,[sp,#-96]! +	add	x29,sp,#0 +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] + +	ldp	w20,w21,[x0] +	ldp	w22,w23,[x0,#8] +	ldr	w24,[x0,#16] + +.Loop: +	ldr	x3,[x1],#64 +	movz	w28,#0x7999 +	sub	x2,x2,#1 +	movk	w28,#0x5a82,lsl#16 +#ifdef	__ARMEB__ +	ror	x3,x3,#32 +#else +	rev32	x3,x3 +#endif +	add	w24,w24,w28		// warm it up +	add	w24,w24,w3 +	lsr	x4,x3,#32 +	ldr	x5,[x1,#-56] +	bic	w25,w23,w21 +	and	w26,w22,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	orr	w25,w25,w26 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	add	w23,w23,w4	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x5,x5,#32 +#else +	rev32	x5,x5 +#endif +	bic	w25,w22,w20 +	and	w26,w21,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	orr	w25,w25,w26 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	add	w22,w22,w5	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	lsr	x6,x5,#32 +	ldr	x7,[x1,#-48] +	bic	w25,w21,w24 +	and	w26,w20,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	orr	w25,w25,w26 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	add	w21,w21,w6	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x7,x7,#32 +#else +	rev32	x7,x7 +#endif +	bic	w25,w20,w23 +	and	w26,w24,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	orr	w25,w25,w26 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	add	w20,w20,w7	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	lsr	x8,x7,#32 +	ldr	x9,[x1,#-40] +	bic	w25,w24,w22 +	and	w26,w23,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	orr	w25,w25,w26 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	add	w24,w24,w8	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x9,x9,#32 +#else +	rev32	x9,x9 +#endif +	bic	w25,w23,w21 +	and	w26,w22,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	orr	w25,w25,w26 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	add	w23,w23,w9	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	lsr	x10,x9,#32 +	ldr	x11,[x1,#-32] +	bic	w25,w22,w20 +	and	w26,w21,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	orr	w25,w25,w26 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	add	w22,w22,w10	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x11,x11,#32 +#else +	rev32	x11,x11 +#endif +	bic	w25,w21,w24 +	and	w26,w20,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future 
e+=K +	orr	w25,w25,w26 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	add	w21,w21,w11	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	lsr	x12,x11,#32 +	ldr	x13,[x1,#-24] +	bic	w25,w20,w23 +	and	w26,w24,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	orr	w25,w25,w26 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	add	w20,w20,w12	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x13,x13,#32 +#else +	rev32	x13,x13 +#endif +	bic	w25,w24,w22 +	and	w26,w23,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	orr	w25,w25,w26 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	add	w24,w24,w13	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	lsr	x14,x13,#32 +	ldr	x15,[x1,#-16] +	bic	w25,w23,w21 +	and	w26,w22,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	orr	w25,w25,w26 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	add	w23,w23,w14	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x15,x15,#32 +#else +	rev32	x15,x15 +#endif +	bic	w25,w22,w20 +	and	w26,w21,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	orr	w25,w25,w26 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	add	w22,w22,w15	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	lsr	x16,x15,#32 +	ldr	x17,[x1,#-8] +	bic	w25,w21,w24 +	and	w26,w20,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	orr	w25,w25,w26 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	add	w21,w21,w16	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +#ifdef	__ARMEB__ +	ror	x17,x17,#32 +#else +	rev32	x17,x17 +#endif +	bic	w25,w20,w23 +	and	w26,w24,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	orr	w25,w25,w26 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	add	w20,w20,w17	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	lsr	x19,x17,#32 +	 eor	w3,w3,w5 +	bic	w25,w24,w22 +	and	w26,w23,w22 +	ror	w27,w21,#27 +	 eor	w3,w3,w11 +	add	w24,w24,w28		// future e+=K +	orr	w25,w25,w26 +	add	w20,w20,w27		// e+=rot(a,5) +	 eor	w3,w3,w16 +	ror	w22,w22,#2 +	add	w24,w24,w19	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w3,w3,#31 +	 eor	w4,w4,w6 +	bic	w25,w23,w21 +	and	w26,w22,w21 +	ror	w27,w20,#27 +	 eor	w4,w4,w12 +	add	w23,w23,w28		// future e+=K +	orr	w25,w25,w26 +	add	w24,w24,w27		// e+=rot(a,5) +	 eor	w4,w4,w17 +	ror	w21,w21,#2 +	add	w23,w23,w3	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w4,w4,#31 +	 eor	w5,w5,w7 +	bic	w25,w22,w20 +	and	w26,w21,w20 +	ror	w27,w24,#27 +	 eor	w5,w5,w13 +	add	w22,w22,w28		// future e+=K +	orr	w25,w25,w26 +	add	w23,w23,w27		// e+=rot(a,5) +	 eor	w5,w5,w19 +	ror	w20,w20,#2 +	add	w22,w22,w4	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w5,w5,#31 +	 eor	w6,w6,w8 +	bic	w25,w21,w24 +	and	w26,w20,w24 +	ror	w27,w23,#27 +	 eor	w6,w6,w14 +	add	w21,w21,w28		// future e+=K +	orr	w25,w25,w26 +	add	w22,w22,w27		// e+=rot(a,5) +	 eor	w6,w6,w3 +	ror	w24,w24,#2 +	add	w21,w21,w5	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w6,w6,#31 +	 eor	w7,w7,w9 +	bic	w25,w20,w23 +	and	w26,w24,w23 +	ror	w27,w22,#27 +	 eor	w7,w7,w15 +	add	w20,w20,w28		// future e+=K +	orr	w25,w25,w26 +	add	w21,w21,w27		// e+=rot(a,5) +	 eor	w7,w7,w4 +	ror	w23,w23,#2 +	add	w20,w20,w6	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w7,w7,#31 +	movz	w28,#0xeba1 +	movk	w28,#0x6ed9,lsl#16 +	 eor	w8,w8,w10 +	bic	w25,w24,w22 +	and	w26,w23,w22 +	ror	w27,w21,#27 +	 eor	w8,w8,w16 +	add	w24,w24,w28		// future e+=K +	orr	w25,w25,w26 +	add	w20,w20,w27		// e+=rot(a,5) +	 eor	w8,w8,w5 +	ror	w22,w22,#2 
+	add	w24,w24,w7	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w8,w8,#31 +	 eor	w9,w9,w11 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w9,w9,w17 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w9,w9,w6 +	add	w23,w23,w8	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w9,w9,#31 +	 eor	w10,w10,w12 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w10,w10,w19 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w10,w10,w7 +	add	w22,w22,w9	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w10,w10,#31 +	 eor	w11,w11,w13 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w11,w11,w3 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w11,w11,w8 +	add	w21,w21,w10	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w11,w11,#31 +	 eor	w12,w12,w14 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w12,w12,w4 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w12,w12,w9 +	add	w20,w20,w11	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w12,w12,#31 +	 eor	w13,w13,w15 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w13,w13,w5 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w13,w13,w10 +	add	w24,w24,w12	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w13,w13,#31 +	 eor	w14,w14,w16 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w14,w14,w6 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w14,w14,w11 +	add	w23,w23,w13	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w14,w14,#31 +	 eor	w15,w15,w17 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w15,w15,w7 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w15,w15,w12 +	add	w22,w22,w14	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w15,w15,#31 +	 eor	w16,w16,w19 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w16,w16,w8 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w16,w16,w13 +	add	w21,w21,w15	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w16,w16,#31 +	 eor	w17,w17,w3 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w17,w17,w9 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w17,w17,w14 +	add	w20,w20,w16	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w17,w17,#31 +	 eor	w19,w19,w4 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w19,w19,w10 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w19,w19,w15 +	add	w24,w24,w17	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w19,w19,#31 +	 eor	w3,w3,w5 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w3,w3,w11 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w3,w3,w16 +	add	w23,w23,w19	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w3,w3,#31 +	 eor	w4,w4,w6 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w4,w4,w12 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w4,w4,w17 +	add	w22,w22,w3	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w4,w4,#31 +	 eor	w5,w5,w7 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 
eor	w5,w5,w13 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w5,w5,w19 +	add	w21,w21,w4	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w5,w5,#31 +	 eor	w6,w6,w8 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w6,w6,w14 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w6,w6,w3 +	add	w20,w20,w5	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w6,w6,#31 +	 eor	w7,w7,w9 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w7,w7,w15 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w7,w7,w4 +	add	w24,w24,w6	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w7,w7,#31 +	 eor	w8,w8,w10 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w8,w8,w16 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w8,w8,w5 +	add	w23,w23,w7	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w8,w8,#31 +	 eor	w9,w9,w11 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w9,w9,w17 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w9,w9,w6 +	add	w22,w22,w8	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w9,w9,#31 +	 eor	w10,w10,w12 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w10,w10,w19 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w10,w10,w7 +	add	w21,w21,w9	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w10,w10,#31 +	 eor	w11,w11,w13 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w11,w11,w3 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w11,w11,w8 +	add	w20,w20,w10	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w11,w11,#31 +	movz	w28,#0xbcdc +	movk	w28,#0x8f1b,lsl#16 +	 eor	w12,w12,w14 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w12,w12,w4 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w12,w12,w9 +	add	w24,w24,w11	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w12,w12,#31 +	orr	w25,w21,w22 +	and	w26,w21,w22 +	 eor	w13,w13,w15 +	ror	w27,w20,#27 +	and	w25,w25,w23 +	add	w23,w23,w28		// future e+=K +	 eor	w13,w13,w5 +	add	w24,w24,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w21,w21,#2 +	 eor	w13,w13,w10 +	add	w23,w23,w12	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w13,w13,#31 +	orr	w25,w20,w21 +	and	w26,w20,w21 +	 eor	w14,w14,w16 +	ror	w27,w24,#27 +	and	w25,w25,w22 +	add	w22,w22,w28		// future e+=K +	 eor	w14,w14,w6 +	add	w23,w23,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w20,w20,#2 +	 eor	w14,w14,w11 +	add	w22,w22,w13	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w14,w14,#31 +	orr	w25,w24,w20 +	and	w26,w24,w20 +	 eor	w15,w15,w17 +	ror	w27,w23,#27 +	and	w25,w25,w21 +	add	w21,w21,w28		// future e+=K +	 eor	w15,w15,w7 +	add	w22,w22,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w24,w24,#2 +	 eor	w15,w15,w12 +	add	w21,w21,w14	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w15,w15,#31 +	orr	w25,w23,w24 +	and	w26,w23,w24 +	 eor	w16,w16,w19 +	ror	w27,w22,#27 +	and	w25,w25,w20 +	add	w20,w20,w28		// future e+=K +	 eor	w16,w16,w8 +	add	w21,w21,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w23,w23,#2 +	 eor	w16,w16,w13 +	add	w20,w20,w15	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w16,w16,#31 +	orr	w25,w22,w23 +	and	w26,w22,w23 +	 eor	w17,w17,w3 +	ror	w27,w21,#27 +	and	w25,w25,w24 +	add	
w24,w24,w28		// future e+=K +	 eor	w17,w17,w9 +	add	w20,w20,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w22,w22,#2 +	 eor	w17,w17,w14 +	add	w24,w24,w16	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w17,w17,#31 +	orr	w25,w21,w22 +	and	w26,w21,w22 +	 eor	w19,w19,w4 +	ror	w27,w20,#27 +	and	w25,w25,w23 +	add	w23,w23,w28		// future e+=K +	 eor	w19,w19,w10 +	add	w24,w24,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w21,w21,#2 +	 eor	w19,w19,w15 +	add	w23,w23,w17	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w19,w19,#31 +	orr	w25,w20,w21 +	and	w26,w20,w21 +	 eor	w3,w3,w5 +	ror	w27,w24,#27 +	and	w25,w25,w22 +	add	w22,w22,w28		// future e+=K +	 eor	w3,w3,w11 +	add	w23,w23,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w20,w20,#2 +	 eor	w3,w3,w16 +	add	w22,w22,w19	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w3,w3,#31 +	orr	w25,w24,w20 +	and	w26,w24,w20 +	 eor	w4,w4,w6 +	ror	w27,w23,#27 +	and	w25,w25,w21 +	add	w21,w21,w28		// future e+=K +	 eor	w4,w4,w12 +	add	w22,w22,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w24,w24,#2 +	 eor	w4,w4,w17 +	add	w21,w21,w3	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w4,w4,#31 +	orr	w25,w23,w24 +	and	w26,w23,w24 +	 eor	w5,w5,w7 +	ror	w27,w22,#27 +	and	w25,w25,w20 +	add	w20,w20,w28		// future e+=K +	 eor	w5,w5,w13 +	add	w21,w21,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w23,w23,#2 +	 eor	w5,w5,w19 +	add	w20,w20,w4	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w5,w5,#31 +	orr	w25,w22,w23 +	and	w26,w22,w23 +	 eor	w6,w6,w8 +	ror	w27,w21,#27 +	and	w25,w25,w24 +	add	w24,w24,w28		// future e+=K +	 eor	w6,w6,w14 +	add	w20,w20,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w22,w22,#2 +	 eor	w6,w6,w3 +	add	w24,w24,w5	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w6,w6,#31 +	orr	w25,w21,w22 +	and	w26,w21,w22 +	 eor	w7,w7,w9 +	ror	w27,w20,#27 +	and	w25,w25,w23 +	add	w23,w23,w28		// future e+=K +	 eor	w7,w7,w15 +	add	w24,w24,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w21,w21,#2 +	 eor	w7,w7,w4 +	add	w23,w23,w6	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w7,w7,#31 +	orr	w25,w20,w21 +	and	w26,w20,w21 +	 eor	w8,w8,w10 +	ror	w27,w24,#27 +	and	w25,w25,w22 +	add	w22,w22,w28		// future e+=K +	 eor	w8,w8,w16 +	add	w23,w23,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w20,w20,#2 +	 eor	w8,w8,w5 +	add	w22,w22,w7	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w8,w8,#31 +	orr	w25,w24,w20 +	and	w26,w24,w20 +	 eor	w9,w9,w11 +	ror	w27,w23,#27 +	and	w25,w25,w21 +	add	w21,w21,w28		// future e+=K +	 eor	w9,w9,w17 +	add	w22,w22,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w24,w24,#2 +	 eor	w9,w9,w6 +	add	w21,w21,w8	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w9,w9,#31 +	orr	w25,w23,w24 +	and	w26,w23,w24 +	 eor	w10,w10,w12 +	ror	w27,w22,#27 +	and	w25,w25,w20 +	add	w20,w20,w28		// future e+=K +	 eor	w10,w10,w19 +	add	w21,w21,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w23,w23,#2 +	 eor	w10,w10,w7 +	add	w20,w20,w9	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w10,w10,#31 +	orr	w25,w22,w23 +	and	w26,w22,w23 +	 eor	w11,w11,w13 +	ror	w27,w21,#27 +	and	w25,w25,w24 +	add	w24,w24,w28		// future e+=K +	 eor	w11,w11,w3 +	add	w20,w20,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w22,w22,#2 +	 eor	w11,w11,w8 +	add	w24,w24,w10	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w11,w11,#31 +	orr	w25,w21,w22 +	and	w26,w21,w22 +	 eor	w12,w12,w14 +	ror	w27,w20,#27 +	and	w25,w25,w23 +	add	w23,w23,w28		// future e+=K +	 eor	w12,w12,w4 +	add	w24,w24,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	
ror	w21,w21,#2 +	 eor	w12,w12,w9 +	add	w23,w23,w11	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w12,w12,#31 +	orr	w25,w20,w21 +	and	w26,w20,w21 +	 eor	w13,w13,w15 +	ror	w27,w24,#27 +	and	w25,w25,w22 +	add	w22,w22,w28		// future e+=K +	 eor	w13,w13,w5 +	add	w23,w23,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w20,w20,#2 +	 eor	w13,w13,w10 +	add	w22,w22,w12	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w13,w13,#31 +	orr	w25,w24,w20 +	and	w26,w24,w20 +	 eor	w14,w14,w16 +	ror	w27,w23,#27 +	and	w25,w25,w21 +	add	w21,w21,w28		// future e+=K +	 eor	w14,w14,w6 +	add	w22,w22,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w24,w24,#2 +	 eor	w14,w14,w11 +	add	w21,w21,w13	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w14,w14,#31 +	orr	w25,w23,w24 +	and	w26,w23,w24 +	 eor	w15,w15,w17 +	ror	w27,w22,#27 +	and	w25,w25,w20 +	add	w20,w20,w28		// future e+=K +	 eor	w15,w15,w7 +	add	w21,w21,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w23,w23,#2 +	 eor	w15,w15,w12 +	add	w20,w20,w14	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w15,w15,#31 +	movz	w28,#0xc1d6 +	movk	w28,#0xca62,lsl#16 +	orr	w25,w22,w23 +	and	w26,w22,w23 +	 eor	w16,w16,w19 +	ror	w27,w21,#27 +	and	w25,w25,w24 +	add	w24,w24,w28		// future e+=K +	 eor	w16,w16,w8 +	add	w20,w20,w27		// e+=rot(a,5) +	orr	w25,w25,w26 +	ror	w22,w22,#2 +	 eor	w16,w16,w13 +	add	w24,w24,w15	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w16,w16,#31 +	 eor	w17,w17,w3 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w17,w17,w9 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w17,w17,w14 +	add	w23,w23,w16	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w17,w17,#31 +	 eor	w19,w19,w4 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w19,w19,w10 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w19,w19,w15 +	add	w22,w22,w17	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w19,w19,#31 +	 eor	w3,w3,w5 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w3,w3,w11 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w3,w3,w16 +	add	w21,w21,w19	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w3,w3,#31 +	 eor	w4,w4,w6 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w4,w4,w12 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w4,w4,w17 +	add	w20,w20,w3	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w4,w4,#31 +	 eor	w5,w5,w7 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w5,w5,w13 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w5,w5,w19 +	add	w24,w24,w4	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w5,w5,#31 +	 eor	w6,w6,w8 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w6,w6,w14 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w6,w6,w3 +	add	w23,w23,w5	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w6,w6,#31 +	 eor	w7,w7,w9 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w7,w7,w15 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w7,w7,w4 +	add	w22,w22,w6	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w7,w7,#31 +	 eor	w8,w8,w10 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w8,w8,w16 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	
w24,w24,#2 +	 eor	w8,w8,w5 +	add	w21,w21,w7	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w8,w8,#31 +	 eor	w9,w9,w11 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w9,w9,w17 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w9,w9,w6 +	add	w20,w20,w8	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w9,w9,#31 +	 eor	w10,w10,w12 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w10,w10,w19 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w10,w10,w7 +	add	w24,w24,w9	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w10,w10,#31 +	 eor	w11,w11,w13 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w11,w11,w3 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w11,w11,w8 +	add	w23,w23,w10	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w11,w11,#31 +	 eor	w12,w12,w14 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w12,w12,w4 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w12,w12,w9 +	add	w22,w22,w11	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w12,w12,#31 +	 eor	w13,w13,w15 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w13,w13,w5 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w13,w13,w10 +	add	w21,w21,w12	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w13,w13,#31 +	 eor	w14,w14,w16 +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	 eor	w14,w14,w6 +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	 eor	w14,w14,w11 +	add	w20,w20,w13	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	 ror	w14,w14,#31 +	 eor	w15,w15,w17 +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	add	w24,w24,w28		// future e+=K +	 eor	w15,w15,w7 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	 eor	w15,w15,w12 +	add	w24,w24,w14	// future e+=X[i] +	add	w20,w20,w25		// e+=F(b,c,d) +	 ror	w15,w15,#31 +	 eor	w16,w16,w19 +	eor	w25,w23,w21 +	ror	w27,w20,#27 +	add	w23,w23,w28		// future e+=K +	 eor	w16,w16,w8 +	eor	w25,w25,w22 +	add	w24,w24,w27		// e+=rot(a,5) +	ror	w21,w21,#2 +	 eor	w16,w16,w13 +	add	w23,w23,w15	// future e+=X[i] +	add	w24,w24,w25		// e+=F(b,c,d) +	 ror	w16,w16,#31 +	 eor	w17,w17,w3 +	eor	w25,w22,w20 +	ror	w27,w24,#27 +	add	w22,w22,w28		// future e+=K +	 eor	w17,w17,w9 +	eor	w25,w25,w21 +	add	w23,w23,w27		// e+=rot(a,5) +	ror	w20,w20,#2 +	 eor	w17,w17,w14 +	add	w22,w22,w16	// future e+=X[i] +	add	w23,w23,w25		// e+=F(b,c,d) +	 ror	w17,w17,#31 +	 eor	w19,w19,w4 +	eor	w25,w21,w24 +	ror	w27,w23,#27 +	add	w21,w21,w28		// future e+=K +	 eor	w19,w19,w10 +	eor	w25,w25,w20 +	add	w22,w22,w27		// e+=rot(a,5) +	ror	w24,w24,#2 +	 eor	w19,w19,w15 +	add	w21,w21,w17	// future e+=X[i] +	add	w22,w22,w25		// e+=F(b,c,d) +	 ror	w19,w19,#31 +	ldp	w4,w5,[x0] +	eor	w25,w20,w23 +	ror	w27,w22,#27 +	add	w20,w20,w28		// future e+=K +	eor	w25,w25,w24 +	add	w21,w21,w27		// e+=rot(a,5) +	ror	w23,w23,#2 +	add	w20,w20,w19	// future e+=X[i] +	add	w21,w21,w25		// e+=F(b,c,d) +	ldp	w6,w7,[x0,#8] +	eor	w25,w24,w22 +	ror	w27,w21,#27 +	eor	w25,w25,w23 +	add	w20,w20,w27		// e+=rot(a,5) +	ror	w22,w22,#2 +	ldr	w8,[x0,#16] +	add	w20,w20,w25		// e+=F(b,c,d) +	add	w21,w21,w5 +	add	w22,w22,w6 +	add	w20,w20,w4 +	add	w23,w23,w7 +	add	w24,w24,w8 +	stp	w20,w21,[x0] +	stp	w22,w23,[x0,#8] +	str	w24,[x0,#16] +	cbnz	x2,.Loop + +	ldp	x19,x20,[sp,#16] +	ldp	x21,x22,[sp,#32] +	
ldp	x23,x24,[sp,#48] +	ldp	x25,x26,[sp,#64] +	ldp	x27,x28,[sp,#80] +	ldr	x29,[sp],#96 +	ret +.size	sha1_block_data_order,.-sha1_block_data_order +.type	sha1_block_armv8,%function +.align	6 +sha1_block_armv8: +.Lv8_entry: +	stp	x29,x30,[sp,#-16]! +	add	x29,sp,#0 + +	adr	x4,.Lconst +	eor	v1.16b,v1.16b,v1.16b +	ld1	{v0.4s},[x0],#16 +	ld1	{v1.s}[0],[x0] +	sub	x0,x0,#16 +	ld1	{v16.4s-v19.4s},[x4] + +.Loop_hw: +	ld1	{v4.16b-v7.16b},[x1],#64 +	sub	x2,x2,#1 +	rev32	v4.16b,v4.16b +	rev32	v5.16b,v5.16b + +	add	v20.4s,v16.4s,v4.4s +	rev32	v6.16b,v6.16b +	orr	v22.16b,v0.16b,v0.16b	// offload + +	add	v21.4s,v16.4s,v5.4s +	rev32	v7.16b,v7.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b +	.inst	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0 +	add	v20.4s,v16.4s,v6.4s +	.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 1 +	.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s +	add	v21.4s,v16.4s,v7.4s +	.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b +	.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 2 +	.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s +	add	v20.4s,v16.4s,v4.4s +	.inst	0x5e281885	//sha1su1 v5.16b,v4.16b +	.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 3 +	.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s +	add	v21.4s,v17.4s,v5.4s +	.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b +	.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 4 +	.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s +	add	v20.4s,v17.4s,v6.4s +	.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b +	.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 5 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s +	add	v21.4s,v17.4s,v7.4s +	.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b +	.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 6 +	.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s +	add	v20.4s,v17.4s,v4.4s +	.inst	0x5e281885	//sha1su1 v5.16b,v4.16b +	.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 7 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s +	add	v21.4s,v17.4s,v5.4s +	.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b +	.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 8 +	.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s +	add	v20.4s,v18.4s,v6.4s +	.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b +	.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 9 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s +	add	v21.4s,v18.4s,v7.4s +	.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b +	.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 10 +	.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s +	add	v20.4s,v18.4s,v4.4s +	.inst	0x5e281885	//sha1su1 v5.16b,v4.16b +	.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 11 +	.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s +	add	v21.4s,v18.4s,v5.4s +	.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b +	.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 12 +	.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s +	add	v20.4s,v18.4s,v6.4s +	.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b +	.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 13 +	.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s +	add	v21.4s,v19.4s,v7.4s +	.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b +	.inst	
0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 14 +	.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s +	add	v20.4s,v19.4s,v4.4s +	.inst	0x5e281885	//sha1su1 v5.16b,v4.16b +	.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 15 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s +	add	v21.4s,v19.4s,v5.4s +	.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b +	.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 16 +	.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s +	add	v20.4s,v19.4s,v6.4s +	.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 17 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s +	add	v21.4s,v19.4s,v7.4s + +	.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 18 +	.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s + +	.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 19 +	.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s + +	add	v1.4s,v1.4s,v2.4s +	add	v0.4s,v0.4s,v22.4s + +	cbnz	x2,.Loop_hw + +	st1	{v0.4s},[x0],#16 +	st1	{v1.s}[0],[x0] + +	ldr	x29,[sp],#16 +	ret +.size	sha1_block_armv8,.-sha1_block_armv8 +.align	6 +.Lconst: +.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19 +.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39 +.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59 +.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79 +.LOPENSSL_armcap_P: +.quad	OPENSSL_armcap_P-. +.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align	2 +.comm	OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha1-armv8.pl b/main/openssl/crypto/sha/asm/sha1-armv8.pl new file mode 100644 index 00000000..c1f552b6 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha1-armv8.pl @@ -0,0 +1,333 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +#		hardware-assisted	software(*) +# Apple A7	2.31			4.13 (+14%) +# Cortex-A5x	n/a			n/a +# +# (*)	Software results are presented mostly for reference purposes. 
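
The scalar .Loop above and the sha1_block_armv8 path are two implementations of the same SHA-1 block compression: the .Lconst words are the four round constants, and the bic/and/orr, eor, and orr/and register patterns in the BODY_00_19, BODY_20_39 and BODY_40_59 subs below are the Ch, Parity and Maj round functions. As a minimal reference sketch of what the unrolled assembly computes (plain C, not part of the patch; sha1_block_ref and its prototype are illustrative):

/*
 * Reference SHA-1 compression of one 64-byte block.  Constants match
 * .Lconst (K_00_19..K_60_79); the round-function selection mirrors
 * BODY_00_19 / BODY_20_39 / BODY_40_59 in sha1-armv8.pl.
 */
#include <stdint.h>

#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

static void sha1_block_ref(uint32_t h[5], const uint32_t X_in[16])
{
    uint32_t X[80];
    uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];

    for (int i = 0; i < 16; i++)        /* input already byte-swapped, as after rev32 */
        X[i] = X_in[i];
    for (int i = 16; i < 80; i++)       /* message schedule expansion */
        X[i] = ROTL32(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1);

    for (int i = 0; i < 80; i++) {
        uint32_t f, k;
        if (i < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; } /* Ch     */
        else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; } /* Parity */
        else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; } /* Maj    */
        else             { f = b ^ c ^ d;                   k = 0xca62c1d6; } /* Parity */

        uint32_t t = ROTL32(a, 5) + f + e + k + X[i];
        e = d; d = c; c = ROTL32(b, 30); b = a; a = t;
    }

    h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}

The generated code keeps only a 16-word rolling window instead of the 80-entry X[] and interleaves the schedule updates with the round arithmetic (the "future e+=K" and "future e+=X[i]" comments above), but the values it feeds into each round are the same.
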
+ +$flavour = shift; +open STDOUT,">".shift; + +($ctx,$inp,$num)=("x0","x1","x2"); +@Xw=map("w$_",(3..17,19)); +@Xx=map("x$_",(3..17,19)); +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); +($t0,$t1,$t2,$K)=map("w$_",(25..28)); + + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i<15 && !($i&1)); +	lsr	@Xx[$i+1],@Xx[$i],#32 +___ +$code.=<<___ if ($i<14 && !($i&1)); +	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`] +___ +$code.=<<___ if ($i<14 && ($i&1)); +#ifdef	__ARMEB__ +	ror	@Xx[$i+1],@Xx[$i+1],#32 +#else +	rev32	@Xx[$i+1],@Xx[$i+1] +#endif +___ +$code.=<<___ if ($i<14); +	bic	$t0,$d,$b +	and	$t1,$c,$b +	ror	$t2,$a,#27 +	add	$d,$d,$K		// future e+=K +	orr	$t0,$t0,$t1 +	add	$e,$e,$t2		// e+=rot(a,5) +	ror	$b,$b,#2 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i] +	add	$e,$e,$t0		// e+=F(b,c,d) +___ +$code.=<<___ if ($i==19); +	movz	$K,#0xeba1 +	movk	$K,#0x6ed9,lsl#16 +___ +$code.=<<___ if ($i>=14); +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15] +	bic	$t0,$d,$b +	and	$t1,$c,$b +	ror	$t2,$a,#27 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15] +	add	$d,$d,$K		// future e+=K +	orr	$t0,$t0,$t1 +	add	$e,$e,$t2		// e+=rot(a,5) +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15] +	ror	$b,$b,#2 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i] +	add	$e,$e,$t0		// e+=F(b,c,d) +	 ror	@Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==59); +	movz	$K,#0xc1d6 +	movk	$K,#0xca62,lsl#16 +___ +$code.=<<___; +	orr	$t0,$b,$c +	and	$t1,$b,$c +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15] +	ror	$t2,$a,#27 +	and	$t0,$t0,$d +	add	$d,$d,$K		// future e+=K +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15] +	add	$e,$e,$t2		// e+=rot(a,5) +	orr	$t0,$t0,$t1 +	ror	$b,$b,#2 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15] +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i] +	add	$e,$e,$t0		// e+=F(b,c,d) +	 ror	@Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==39); +	movz	$K,#0xbcdc +	movk	$K,#0x8f1b,lsl#16 +___ +$code.=<<___ if ($i<78); +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15] +	eor	$t0,$d,$b +	ror	$t2,$a,#27 +	add	$d,$d,$K		// future e+=K +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15] +	eor	$t0,$t0,$c +	add	$e,$e,$t2		// e+=rot(a,5) +	ror	$b,$b,#2 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15] +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i] +	add	$e,$e,$t0		// e+=F(b,c,d) +	 ror	@Xw[$j],@Xw[$j],#31 +___ +$code.=<<___ if ($i==78); +	ldp	@Xw[1],@Xw[2],[$ctx] +	eor	$t0,$d,$b +	ror	$t2,$a,#27 +	add	$d,$d,$K		// future e+=K +	eor	$t0,$t0,$c +	add	$e,$e,$t2		// e+=rot(a,5) +	ror	$b,$b,#2 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i] +	add	$e,$e,$t0		// e+=F(b,c,d) +___ +$code.=<<___ if ($i==79); +	ldp	@Xw[3],@Xw[4],[$ctx,#8] +	eor	$t0,$d,$b +	ror	$t2,$a,#27 +	eor	$t0,$t0,$c +	add	$e,$e,$t2		// e+=rot(a,5) +	ror	$b,$b,#2 +	ldr	@Xw[5],[$ctx,#16] +	add	$e,$e,$t0		// e+=F(b,c,d) +___ +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.globl	sha1_block_data_order +.type	sha1_block_data_order,%function +.align	6 +sha1_block_data_order: +	ldr	x16,.LOPENSSL_armcap_P +	adr	x17,.LOPENSSL_armcap_P +	add	x16,x16,x17 +	ldr	w16,[x16] +	tst	w16,#ARMV8_SHA1 +	b.ne	.Lv8_entry + +	stp	x29,x30,[sp,#-96]! 
+	add	x29,sp,#0 +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] + +	ldp	$A,$B,[$ctx] +	ldp	$C,$D,[$ctx,#8] +	ldr	$E,[$ctx,#16] + +.Loop: +	ldr	@Xx[0],[$inp],#64 +	movz	$K,#0x7999 +	sub	$num,$num,#1 +	movk	$K,#0x5a82,lsl#16 +#ifdef	__ARMEB__ +	ror	$Xx[0],@Xx[0],#32 +#else +	rev32	@Xx[0],@Xx[0] +#endif +	add	$E,$E,$K		// warm it up +	add	$E,$E,@Xw[0] +___ +for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +	add	$B,$B,@Xw[2] +	add	$C,$C,@Xw[3] +	add	$A,$A,@Xw[1] +	add	$D,$D,@Xw[4] +	add	$E,$E,@Xw[5] +	stp	$A,$B,[$ctx] +	stp	$C,$D,[$ctx,#8] +	str	$E,[$ctx,#16] +	cbnz	$num,.Loop + +	ldp	x19,x20,[sp,#16] +	ldp	x21,x22,[sp,#32] +	ldp	x23,x24,[sp,#48] +	ldp	x25,x26,[sp,#64] +	ldp	x27,x28,[sp,#80] +	ldr	x29,[sp],#96 +	ret +.size	sha1_block_data_order,.-sha1_block_data_order +___ +{{{ +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); +my @MSG=map("v$_.16b",(4..7)); +my @Kxx=map("v$_.4s",(16..19)); +my ($W0,$W1)=("v20.4s","v21.4s"); +my $ABCD_SAVE="v22.16b"; + +$code.=<<___; +.type	sha1_block_armv8,%function +.align	6 +sha1_block_armv8: +.Lv8_entry: +	stp	x29,x30,[sp,#-16]! +	add	x29,sp,#0 + +	adr	x4,.Lconst +	eor	$E,$E,$E +	ld1.32	{$ABCD},[$ctx],#16 +	ld1.32	{$E}[0],[$ctx] +	sub	$ctx,$ctx,#16 +	ld1.32	{@Kxx[0]-@Kxx[3]},[x4] + +.Loop_hw: +	ld1	{@MSG[0]-@MSG[3]},[$inp],#64 +	sub	$num,$num,#1 +	rev32	@MSG[0],@MSG[0] +	rev32	@MSG[1],@MSG[1] + +	add.i32	$W0,@Kxx[0],@MSG[0] +	rev32	@MSG[2],@MSG[2] +	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload + +	add.i32	$W1,@Kxx[0],@MSG[1] +	rev32	@MSG[3],@MSG[3] +	sha1h	$E1,$ABCD +	sha1c	$ABCD,$E,$W0		// 0 +	add.i32	$W0,@Kxx[$j],@MSG[2] +	sha1su0	@MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; +	sha1h	$E0,$ABCD		// $i +	sha1$f	$ABCD,$E1,$W1 +	add.i32	$W1,@Kxx[$j],@MSG[3] +	sha1su1	@MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); +	sha1su0	@MSG[1],@MSG[2],@MSG[3] +___ +	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0); +	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0); +} +$code.=<<___; +	sha1h	$E0,$ABCD		// $i +	sha1p	$ABCD,$E1,$W1 +	add.i32	$W1,@Kxx[$j],@MSG[3] + +	sha1h	$E1,$ABCD		// 18 +	sha1p	$ABCD,$E0,$W0 + +	sha1h	$E0,$ABCD		// 19 +	sha1p	$ABCD,$E1,$W1 + +	add.i32	$E,$E,$E0 +	add.i32	$ABCD,$ABCD,$ABCD_SAVE + +	cbnz	$num,.Loop_hw + +	st1.32	{$ABCD},[$ctx],#16 +	st1.32	{$E}[0],[$ctx] + +	ldr	x29,[sp],#16 +	ret +.size	sha1_block_armv8,.-sha1_block_armv8 +.align	6 +.Lconst: +.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19 +.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39 +.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59 +.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79 +.LOPENSSL_armcap_P: +.quad	OPENSSL_armcap_P-. 
+.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align	2 +.comm	OPENSSL_armcap_P,4,4 +___ +}}} + +{   my	%opcode = ( +	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000, +	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000, +	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	); + +    sub unsha1 { +	my ($mnemonic,$arg)=@_; + +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o +	&& +	sprintf ".inst\t0x%08x\t//%s %s", +			$opcode{$mnemonic}|$1|($2<<5)|($3<<16), +			$mnemonic,$arg; +    } +} + +foreach(split("\n",$code)) { + +	s/\`([^\`]*)\`/eval($1)/geo; + +	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; + +	s/\.\w?32\b//o		and s/\.16b/\.4s/go; +	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go; + +	print $_,"\n"; +} + +close STDOUT; diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.pl b/main/openssl/crypto/sha/asm/sha256-armv4.pl index 9c84e8d9..505ca8f3 100644 --- a/main/openssl/crypto/sha/asm/sha256-armv4.pl +++ b/main/openssl/crypto/sha/asm/sha256-armv4.pl @@ -1,7 +1,7 @@  #!/usr/bin/env perl  # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL  # project. The module is, however, dual licensed under OpenSSL and  # CRYPTOGAMS licenses depending on where you obtain it. For further  # details see http://www.openssl.org/~appro/cryptogams/. @@ -21,15 +21,27 @@  # February 2011.  #  # Profiler-assisted and platform-specific optimization resulted in 16% -# improvement on Cortex A8 core and ~17 cycles per processed byte. +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.  
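
The September 2013 and May 2014 notes above explain why sha256_block_data_order now starts by loading OPENSSL_armcap_P and branching to .LARMv8 or .LNEON before falling through to the integer code. A hedged C sketch of that dispatch (the three sha256_block_* names are illustrative stand-ins for the integer, .LNEON and .LARMv8 bodies; the flag bits are restated from OpenSSL's arm_arch.h so the sketch is self-contained):

#include <stddef.h>
#include <stdint.h>

extern unsigned int OPENSSL_armcap_P;      /* set by OpenSSL's ARM capability probe at library init */

/* bit positions restated from arm_arch.h so the sketch stands alone */
#define ARMV7_NEON    (1 << 0)
#define ARMV8_SHA256  (1 << 4)

typedef void (*sha256_block_fn)(uint32_t state[8], const void *in, size_t num);

/* illustrative stand-ins for the three code paths emitted by the .pl script */
extern void sha256_block_integer(uint32_t *state, const void *in, size_t num);
extern void sha256_block_neon(uint32_t *state, const void *in, size_t num);
extern void sha256_block_armv8(uint32_t *state, const void *in, size_t num);

static sha256_block_fn sha256_block_pick(void)
{
    if (OPENSSL_armcap_P & ARMV8_SHA256)
        return sha256_block_armv8;         /* SHA-256 crypto extensions (sha256h/h2/su0/su1) */
    if (OPENSSL_armcap_P & ARMV7_NEON)
        return sha256_block_neon;          /* NEON-vectorised message schedule */
    return sha256_block_integer;           /* portable ARMv4+ integer path */
}

Because the test lives in the single exported sha256_block_data_order entry point, the NEON and ARMv8 bodies stay internal labels and callers see no API change; the only new data the patch needs is the .LOPENSSL_armcap word and the zero terminator after K256 that the NEON loop uses to detect the end of the table.
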
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}  open STDOUT,">$output";  $ctx="r0";	$t0="r0"; -$inp="r1";	$t3="r1"; +$inp="r1";	$t4="r1";  $len="r2";	$t1="r2"; -$T1="r3"; +$T1="r3";	$t3="r3";  $A="r4";  $B="r5";  $C="r6"; @@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;  $code.=<<___ if ($i<16);  #if __ARM_ARCH__>=7 -	ldr	$T1,[$inp],#4 +	@ ldr	$t1,[$inp],#4			@ $i +# if $i==15 +	str	$inp,[sp,#17*4]			@ make room for $t4 +# endif +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e) +	rev	$t1,$t1  #else -	ldrb	$T1,[$inp,#3]			@ $i +	@ ldrb	$t1,[$inp,#3]			@ $i +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past  	ldrb	$t2,[$inp,#2] -	ldrb	$t1,[$inp,#1] -	ldrb	$t0,[$inp],#4 -	orr	$T1,$T1,$t2,lsl#8 -	orr	$T1,$T1,$t1,lsl#16 -	orr	$T1,$T1,$t0,lsl#24 +	ldrb	$t0,[$inp,#1] +	orr	$t1,$t1,$t2,lsl#8 +	ldrb	$t2,[$inp],#4 +	orr	$t1,$t1,$t0,lsl#16 +# if $i==15 +	str	$inp,[sp,#17*4]			@ make room for $t4 +# endif +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` +	orr	$t1,$t1,$t2,lsl#24 +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)  #endif  ___  $code.=<<___; -	mov	$t0,$e,ror#$Sigma1[0]  	ldr	$t2,[$Ktbl],#4			@ *K256++ -	eor	$t0,$t0,$e,ror#$Sigma1[1] +	add	$h,$h,$t1			@ h+=X[i] +	str	$t1,[sp,#`$i%16`*4]  	eor	$t1,$f,$g -#if $i>=16 -	add	$T1,$T1,$t3			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	$T1,$T1 -#endif -#if $i==15 -	str	$inp,[sp,#17*4]			@ leave room for $t3 -#endif -	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e) +	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)  	and	$t1,$t1,$e -	str	$T1,[sp,#`$i%16`*4] -	add	$T1,$T1,$t0 +	add	$h,$h,$t2			@ h+=K256[i]  	eor	$t1,$t1,$g			@ Ch(e,f,g) -	add	$T1,$T1,$h -	mov	$h,$a,ror#$Sigma0[0] -	add	$T1,$T1,$t1 -	eor	$h,$h,$a,ror#$Sigma0[1] -	add	$T1,$T1,$t2 -	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a) -#if $i>=15 -	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx +	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` +	add	$h,$h,$t1			@ h+=Ch(e,f,g) +#if $i==31 +	and	$t2,$t2,#0xff +	cmp	$t2,#0xf2			@ done?  
#endif -	orr	$t0,$a,$b -	and	$t1,$a,$b -	and	$t0,$t0,$c -	add	$h,$h,$T1 -	orr	$t0,$t0,$t1			@ Maj(a,b,c) -	add	$d,$d,$T1 -	add	$h,$h,$t0 +#if $i<15 +# if __ARM_ARCH__>=7 +	ldr	$t1,[$inp],#4			@ prefetch +# else +	ldrb	$t1,[$inp,#3] +# endif +	eor	$t2,$a,$b			@ a^b, b^c in next round +#else +	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx +	eor	$t2,$a,$b			@ a^b, b^c in next round +	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx +#endif +	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a) +	and	$t3,$t3,$t2			@ (b^c)&=(a^b) +	add	$d,$d,$h			@ d+=h +	eor	$t3,$t3,$b			@ Maj(a,b,c) +	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a) +	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)  ___ +	($t2,$t3)=($t3,$t2);  }  sub BODY_16_XX {  my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;  $code.=<<___; -	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i -	ldr	$t2,[sp,#`($i+14)%16`*4] -	mov	$t0,$t3,ror#$sigma0[0] -	ldr	$T1,[sp,#`($i+0)%16`*4] -	eor	$t0,$t0,$t3,ror#$sigma0[1] -	ldr	$t1,[sp,#`($i+9)%16`*4] -	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1]) -	mov	$t3,$t2,ror#$sigma1[0] -	add	$T1,$T1,$t0 -	eor	$t3,$t3,$t2,ror#$sigma1[1] -	add	$T1,$T1,$t1 -	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14]) -	@ add	$T1,$T1,$t3 +	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i +	@ ldr	$t4,[sp,#`($i+14)%16`*4] +	mov	$t0,$t1,ror#$sigma0[0] +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past +	mov	$t2,$t4,ror#$sigma1[0] +	eor	$t0,$t0,$t1,ror#$sigma0[1] +	eor	$t2,$t2,$t4,ror#$sigma1[1] +	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1]) +	ldr	$t1,[sp,#`($i+0)%16`*4] +	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14]) +	ldr	$t4,[sp,#`($i+9)%16`*4] + +	add	$t2,$t2,$t0 +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15 +	add	$t1,$t1,$t2 +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e) +	add	$t1,$t1,$t4			@ X[i]  ___  	&BODY_00_15(@_);  } @@ -147,46 +176,64 @@ K256:  .word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208  .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2  .size	K256,.-K256 +.word	0				@ terminator +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-sha256_block_data_order +.align	5  .global	sha256_block_data_order  .type	sha256_block_data_order,%function  sha256_block_data_order:  	sub	r3,pc,#8		@ sha256_block_data_order  	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp +#if __ARM_ARCH__>=7 +	ldr	r12,.LOPENSSL_armcap +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P +	tst	r12,#ARMV8_SHA256 +	bne	.LARMv8 +	tst	r12,#ARMV7_NEON +	bne	.LNEON +#endif  	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}  	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H} -	sub	$Ktbl,r3,#256		@ K256 +	sub	$Ktbl,r3,#256+32	@ K256  	sub	sp,sp,#16*4		@ alloca(X[16])  .Loop: +# if __ARM_ARCH__>=7 +	ldr	$t1,[$inp],#4 +# else +	ldrb	$t1,[$inp,#3] +# endif +	eor	$t3,$B,$C		@ magic +	eor	$t2,$t2,$t2  ___  for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }  $code.=".Lrounds_16_xx:\n";  for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }  $code.=<<___; -	and	$t2,$t2,#0xff -	cmp	$t2,#0xf2 +	ldreq	$t3,[sp,#16*4]		@ pull ctx  	bne	.Lrounds_16_xx -	ldr	$T1,[sp,#16*4]		@ pull ctx -	ldr	$t0,[$T1,#0] -	ldr	$t1,[$T1,#4] -	ldr	$t2,[$T1,#8] +	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past +	ldr	$t0,[$t3,#0] +	ldr	$t1,[$t3,#4] +	ldr	$t2,[$t3,#8]  	add	$A,$A,$t0 -	ldr	$t0,[$T1,#12] +	ldr	$t0,[$t3,#12]  	add	$B,$B,$t1 -	ldr	$t1,[$T1,#16] +	ldr	$t1,[$t3,#16]  	add	$C,$C,$t2 -	ldr	$t2,[$T1,#20] +	ldr	$t2,[$t3,#20]  	add	$D,$D,$t0 -	ldr	$t0,[$T1,#24] +	ldr	$t0,[$t3,#24]  	add	$E,$E,$t1 -	ldr	$t1,[$T1,#28] +	ldr	$t1,[$t3,#28]  	add	$F,$F,$t2  	ldr	$inp,[sp,#17*4]		@ pull inp  	ldr	$t2,[sp,#18*4]		@ 
pull inp+len  	add	$G,$G,$t0  	add	$H,$H,$t1 -	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H} +	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}  	cmp	$inp,$t2  	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl  	bne	.Loop @@ -200,12 +247,410 @@ $code.=<<___;  	moveq	pc,lr			@ be binary compatible with V4, yet  	bx	lr			@ interoperable with Thumb ISA:-)  #endif -.size   sha256_block_data_order,.-sha256_block_data_order -.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +.size	sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     } +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   } + +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; +  my $arg = pop; +    $arg = "#$arg" if ($arg*1 eq $arg); +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e,$f,$g,$h); + +	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshr_u32	($T2,$T0,$sigma0[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshr_u32	($T1,$T0,$sigma0[2]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vsli_32	($T2,$T0,32-$sigma0[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vshr_u32	($T3,$T0,$sigma0[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		($T1,$T1,$T2); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vsli_32	($T3,$T0,32-$sigma0[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&veor		($T1,$T1,$T3);		# sigma0(X[1..4]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &veor		($T5,$T5,$T4); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &veor		($T5,$T5,$T4); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vld1_32	("{$T0}","[$Ktbl,:128]!"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &vsli_32	
($T4,&Dlo(@X[0]),32-$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	($T0,$T0,@X[0]); +	 while($#insns>=2) { eval(shift(@insns)); } +	&vst1_32	("{$T0}","[$Xfer,:128]!"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); + +	push(@X,shift(@X));		# "rotate" X[] +} + +sub Xpreload() +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e,$f,$g,$h); + +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vld1_32	("{$T0}","[$Ktbl,:128]!"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vrev32_8	(@X[0],@X[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&vadd_i32	($T0,$T0,@X[0]); +	 foreach (@insns) { eval; }	# remaining instructions +	&vst1_32	("{$T0}","[$Xfer,:128]!"); + +	push(@X,shift(@X));		# "rotate" X[] +} + +sub body_00_15 () { +	( +	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. +	'&add	($h,$h,$t1)',			# h+=X[i]+K[i] +	'&eor	($t1,$f,$g)', +	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', +	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past +	'&and	($t1,$t1,$e)', +	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e) +	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', +	'&eor	($t1,$t1,$g)',			# Ch(e,f,g) +	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e) +	'&eor	($t2,$a,$b)',			# a^b, b^c in next round +	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a) +	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g) +	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'. +	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'. +	'&ldr	($t1,"[sp,#64]")			if ($j==31)', +	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b) +	'&add	($d,$d,$h)',			# d+=h +	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a) +	'&eor	($t3,$t3,$b)',			# Maj(a,b,c) +	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' +	) +} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	sha256_block_data_order_neon,%function +.align	4 +sha256_block_data_order_neon: +.LNEON: +	stmdb	sp!,{r4-r12,lr} + +	mov	$t2,sp +	sub	sp,sp,#16*4+16		@ alloca +	sub	$Ktbl,r3,#256+32	@ K256 +	bic	sp,sp,#15		@ align for 128-bit stores + +	vld1.8		{@X[0]},[$inp]! +	vld1.8		{@X[1]},[$inp]! +	vld1.8		{@X[2]},[$inp]! +	vld1.8		{@X[3]},[$inp]! +	vld1.32		{$T0},[$Ktbl,:128]! +	vld1.32		{$T1},[$Ktbl,:128]! +	vld1.32		{$T2},[$Ktbl,:128]! +	vld1.32		{$T3},[$Ktbl,:128]! +	vrev32.8	@X[0],@X[0]		@ yes, even on +	str		$ctx,[sp,#64] +	vrev32.8	@X[1],@X[1]		@ big-endian +	str		$inp,[sp,#68] +	mov		$Xfer,sp +	vrev32.8	@X[2],@X[2] +	str		$len,[sp,#72] +	vrev32.8	@X[3],@X[3] +	str		$t2,[sp,#76]		@ save original sp +	vadd.i32	$T0,$T0,@X[0] +	vadd.i32	$T1,$T1,@X[1] +	vst1.32		{$T0},[$Xfer,:128]! +	vadd.i32	$T2,$T2,@X[2] +	vst1.32		{$T1},[$Xfer,:128]! +	vadd.i32	$T3,$T3,@X[3] +	vst1.32		{$T2},[$Xfer,:128]! +	vst1.32		{$T3},[$Xfer,:128]! 
+ +	ldmia		$ctx,{$A-$H} +	sub		$Xfer,$Xfer,#64 +	ldr		$t1,[sp,#0] +	eor		$t2,$t2,$t2 +	eor		$t3,$B,$C +	b		.L_00_48 + +.align	4 +.L_00_48: +___ +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +$code.=<<___; +	teq	$t1,#0				@ check for K256 terminator +	ldr	$t1,[sp,#0] +	sub	$Xfer,$Xfer,#64 +	bne	.L_00_48 + +	ldr		$inp,[sp,#68] +	ldr		$t0,[sp,#72] +	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl +	teq		$inp,$t0 +	subeq		$inp,$inp,#64		@ avoid SEGV +	vld1.8		{@X[0]},[$inp]!		@ load next input block +	vld1.8		{@X[1]},[$inp]! +	vld1.8		{@X[2]},[$inp]! +	vld1.8		{@X[3]},[$inp]! +	strne		$inp,[sp,#68] +	mov		$Xfer,sp +___ +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +$code.=<<___; +	ldr	$t0,[$t1,#0] +	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past +	ldr	$t2,[$t1,#4] +	ldr	$t3,[$t1,#8] +	ldr	$t4,[$t1,#12] +	add	$A,$A,$t0			@ accumulate +	ldr	$t0,[$t1,#16] +	add	$B,$B,$t2 +	ldr	$t2,[$t1,#20] +	add	$C,$C,$t3 +	ldr	$t3,[$t1,#24] +	add	$D,$D,$t4 +	ldr	$t4,[$t1,#28] +	add	$E,$E,$t0 +	str	$A,[$t1],#4 +	add	$F,$F,$t2 +	str	$B,[$t1],#4 +	add	$G,$G,$t3 +	str	$C,[$t1],#4 +	add	$H,$H,$t4 +	str	$D,[$t1],#4 +	stmia	$t1,{$E-$H} + +	movne	$Xfer,sp +	ldrne	$t1,[sp,#0] +	eorne	$t2,$t2,$t2 +	ldreq	sp,[sp,#76]			@ restore original sp +	eorne	$t3,$B,$C +	bne	.L_00_48 + +	ldmia	sp!,{r4-r12,pc} +.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_ARCH__>=7 +.type	sha256_block_data_order_armv8,%function +.align	5 +sha256_block_data_order_armv8: +.LARMv8: +	vld1.32	{$ABCD,$EFGH},[$ctx] +	sub	$Ktbl,r3,#sha256_block_data_order-K256 + +.Loop_v8: +	vld1.8		{@MSG[0]-@MSG[1]},[$inp]! +	vld1.8		{@MSG[2]-@MSG[3]},[$inp]! +	vld1.32		{$W0},[$Ktbl]! +	vrev32.8	@MSG[0],@MSG[0] +	vrev32.8	@MSG[1],@MSG[1] +	vrev32.8	@MSG[2],@MSG[2] +	vrev32.8	@MSG[3],@MSG[3] +	vmov		$ABCD_SAVE,$ABCD	@ offload +	vmov		$EFGH_SAVE,$EFGH +	teq		$inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; +	vld1.32		{$W1},[$Ktbl]! +	vadd.i32	$W0,$W0,@MSG[0] +	sha256su0	@MSG[0],@MSG[1] +	vmov		$abcd,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 +	sha256su1	@MSG[0],@MSG[2],@MSG[3] +___ +	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG)); +} +$code.=<<___; +	vld1.32		{$W1},[$Ktbl]! +	vadd.i32	$W0,$W0,@MSG[0] +	vmov		$abcd,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	vld1.32		{$W0},[$Ktbl]! 
+	vadd.i32	$W1,$W1,@MSG[1] +	vmov		$abcd,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	vld1.32		{$W1},[$Ktbl] +	vadd.i32	$W0,$W0,@MSG[2] +	sub		$Ktbl,$Ktbl,#256-16	@ rewind +	vmov		$abcd,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	vadd.i32	$W1,$W1,@MSG[3] +	vmov		$abcd,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE +	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE +	bne		.Loop_v8 + +	vst1.32		{$ABCD,$EFGH},[$ctx] + +	ret		@ bx lr +.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"  .align	2 +.comm   OPENSSL_armcap_P,4,4  ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 -print $code; +{   my  %opcode = ( +	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40, +	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	); + +    sub unsha256 { +	my ($mnemonic,$arg)=@_; + +	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { +	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) +					 |(($2&7)<<17)|(($2&8)<<4) +					 |(($3&7)<<1) |(($3&8)<<2); +	    # since ARMv7 instructions are always encoded little-endian. +	    # correct solution is to use .inst directive, but older +	    # assemblers don't implement it:-( +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", +			$word&0xff,($word>>8)&0xff, +			($word>>16)&0xff,($word>>24)&0xff, +			$mnemonic,$arg; +	} +    } +} + +foreach (split($/,$code)) { + +	s/\`([^\`]*)\`/eval $1/geo; + +	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + +	s/\bret\b/bx	lr/go		or +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4 + +	print $_,"\n"; +} +  close STDOUT; # enforce flush diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.s b/main/openssl/crypto/sha/asm/sha256-armv4.s index 9c20a63c..853d7da5 100644 --- a/main/openssl/crypto/sha/asm/sha256-armv4.s +++ b/main/openssl/crypto/sha/asm/sha256-armv4.s @@ -23,1463 +23,1721 @@ K256:  .word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208  .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2  .size	K256,.-K256 +.word	0				@ terminator +.LOPENSSL_armcap: +.word	OPENSSL_armcap_P-sha256_block_data_order +.align	5  .global	sha256_block_data_order  .type	sha256_block_data_order,%function  sha256_block_data_order:  	sub	r3,pc,#8		@ sha256_block_data_order  	add	r2,r1,r2,lsl#6	@ len to point at the end of inp +#if __ARM_ARCH__>=7 +	ldr	r12,.LOPENSSL_armcap +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P +	tst	r12,#ARMV8_SHA256 +	bne	.LARMv8 +	tst	r12,#ARMV7_NEON +	bne	.LNEON +#endif  	stmdb	sp!,{r0,r1,r2,r4-r11,lr}  	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11} -	sub	r14,r3,#256		@ K256 +	sub	r14,r3,#256+32	@ K256  	sub	sp,sp,#16*4		@ alloca(X[16])  .Loop: +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4 +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r5,r6		@ magic +	eor	r12,r12,r12  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 0 +# if 0==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r8,r8,ror#5 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r8,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 0 +	@ ldrb	r2,[r1,#3]			@ 0 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	
r2,r2,r0,lsl#16 +# if 0==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r8,r8,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r8,ror#19	@ Sigma1(e)  #endif -	mov	r0,r8,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r8,ror#11 +	add	r11,r11,r2			@ h+=X[i] +	str	r2,[sp,#0*4]  	eor	r2,r9,r10 -#if 0>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 0==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r8,ror#25	@ Sigma1(e) +	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r8 -	str	r3,[sp,#0*4] -	add	r3,r3,r0 +	add	r11,r11,r12			@ h+=K256[i]  	eor	r2,r2,r10			@ Ch(e,f,g) -	add	r3,r3,r11 -	mov	r11,r4,ror#2 -	add	r3,r3,r2 -	eor	r11,r11,r4,ror#13 -	add	r3,r3,r12 -	eor	r11,r11,r4,ror#22		@ Sigma0(a) -#if 0>=15 -	ldr	r1,[sp,#2*4]		@ from BODY_16_xx -#endif -	orr	r0,r4,r5 -	and	r2,r4,r5 -	and	r0,r0,r6 -	add	r11,r11,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r7,r7,r3 -	add	r11,r11,r0 +	eor	r0,r4,r4,ror#11 +	add	r11,r11,r2			@ h+=Ch(e,f,g) +#if 0==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? +#endif +#if 0<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r4,r5			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx +	eor	r12,r4,r5			@ a^b, b^c in next round +	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r4,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r7,r7,r11			@ d+=h +	eor	r3,r3,r5			@ Maj(a,b,c) +	add	r11,r11,r0,ror#2	@ h+=Sigma0(a) +	@ add	r11,r11,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 1 +# if 1==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r7,r7,ror#5 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 1 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r7,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r7,ror#11 +	@ ldrb	r2,[r1,#3]			@ 1 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 1==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r7,r7,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r10,r10,r2			@ h+=X[i] +	str	r2,[sp,#1*4]  	eor	r2,r8,r9 -#if 1>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 1==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r7,ror#25	@ Sigma1(e) +	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r7 -	str	r3,[sp,#1*4] -	add	r3,r3,r0 +	add	r10,r10,r3			@ h+=K256[i]  	eor	r2,r2,r9			@ Ch(e,f,g) -	add	r3,r3,r10 -	mov	r10,r11,ror#2 -	add	r3,r3,r2 -	eor	r10,r10,r11,ror#13 -	add	r3,r3,r12 -	eor	r10,r10,r11,ror#22		@ Sigma0(a) -#if 1>=15 -	ldr	r1,[sp,#3*4]		@ from BODY_16_xx -#endif -	orr	r0,r11,r4 -	and	r2,r11,r4 -	and	r0,r0,r5 -	add	r10,r10,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r6,r6,r3 -	add	r10,r10,r0 +	eor	r0,r11,r11,ror#11 +	add	r10,r10,r2			@ h+=Ch(e,f,g) +#if 1==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 1<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r11,r4			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx +	eor	r3,r11,r4			@ a^b, b^c in next round +	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r11,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r6,r6,r10			@ d+=h +	eor	r12,r12,r4			@ Maj(a,b,c) +	add	r10,r10,r0,ror#2	@ h+=Sigma0(a) +	@ add	r10,r10,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 2 +# if 2==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r6,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 2 +	@ ldrb	r2,[r1,#3]			@ 2 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 2==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r6,r6,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r6,ror#19	@ Sigma1(e)  #endif -	mov	r0,r6,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r6,ror#11 +	add	r9,r9,r2			@ h+=X[i] +	str	r2,[sp,#2*4]  	eor	r2,r7,r8 -#if 2>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 2==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r6,ror#25	@ Sigma1(e) +	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r6 -	str	r3,[sp,#2*4] -	add	r3,r3,r0 +	add	r9,r9,r12			@ h+=K256[i]  	eor	r2,r2,r8			@ Ch(e,f,g) -	add	r3,r3,r9 -	mov	r9,r10,ror#2 -	add	r3,r3,r2 -	eor	r9,r9,r10,ror#13 -	add	r3,r3,r12 -	eor	r9,r9,r10,ror#22		@ Sigma0(a) -#if 2>=15 -	ldr	r1,[sp,#4*4]		@ from BODY_16_xx -#endif -	orr	r0,r10,r11 -	and	r2,r10,r11 -	and	r0,r0,r4 -	add	r9,r9,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r5,r5,r3 -	add	r9,r9,r0 +	eor	r0,r10,r10,ror#11 +	add	r9,r9,r2			@ h+=Ch(e,f,g) +#if 2==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r10,r11			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx +	eor	r12,r10,r11			@ a^b, b^c in next round +	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r10,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r5,r5,r9			@ d+=h +	eor	r3,r3,r11			@ Maj(a,b,c) +	add	r9,r9,r0,ror#2	@ h+=Sigma0(a) +	@ add	r9,r9,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 3 +# if 3==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r5,r5,ror#5 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 3 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r5,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r5,ror#11 +	@ ldrb	r2,[r1,#3]			@ 3 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 3==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r5,r5,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r8,r8,r2			@ h+=X[i] +	str	r2,[sp,#3*4]  	eor	r2,r6,r7 -#if 3>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 3==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r5,ror#25	@ Sigma1(e) +	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r5 -	str	r3,[sp,#3*4] -	add	r3,r3,r0 +	add	r8,r8,r3			@ h+=K256[i]  	eor	r2,r2,r7			@ Ch(e,f,g) -	add	r3,r3,r8 -	mov	r8,r9,ror#2 -	add	r3,r3,r2 -	eor	r8,r8,r9,ror#13 -	add	r3,r3,r12 -	eor	r8,r8,r9,ror#22		@ Sigma0(a) -#if 3>=15 -	ldr	r1,[sp,#5*4]		@ from BODY_16_xx -#endif -	orr	r0,r9,r10 -	and	r2,r9,r10 -	and	r0,r0,r11 -	add	r8,r8,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r4,r4,r3 -	add	r8,r8,r0 +	eor	r0,r9,r9,ror#11 +	add	r8,r8,r2			@ h+=Ch(e,f,g) +#if 3==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 3<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r9,r10			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx +	eor	r3,r9,r10			@ a^b, b^c in next round +	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r9,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r4,r4,r8			@ d+=h +	eor	r12,r12,r10			@ Maj(a,b,c) +	add	r8,r8,r0,ror#2	@ h+=Sigma0(a) +	@ add	r8,r8,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 4 +# if 4==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r4,r4,ror#5 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r4,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 4 +	@ ldrb	r2,[r1,#3]			@ 4 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 4==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r4,r4,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r4,ror#19	@ Sigma1(e)  #endif -	mov	r0,r4,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r4,ror#11 +	add	r7,r7,r2			@ h+=X[i] +	str	r2,[sp,#4*4]  	eor	r2,r5,r6 -#if 4>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 4==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r4,ror#25	@ Sigma1(e) +	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r4 -	str	r3,[sp,#4*4] -	add	r3,r3,r0 +	add	r7,r7,r12			@ h+=K256[i]  	eor	r2,r2,r6			@ Ch(e,f,g) -	add	r3,r3,r7 -	mov	r7,r8,ror#2 -	add	r3,r3,r2 -	eor	r7,r7,r8,ror#13 -	add	r3,r3,r12 -	eor	r7,r7,r8,ror#22		@ Sigma0(a) -#if 4>=15 -	ldr	r1,[sp,#6*4]		@ from BODY_16_xx -#endif -	orr	r0,r8,r9 -	and	r2,r8,r9 -	and	r0,r0,r10 -	add	r7,r7,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r11,r11,r3 -	add	r7,r7,r0 +	eor	r0,r8,r8,ror#11 +	add	r7,r7,r2			@ h+=Ch(e,f,g) +#if 4==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r8,r9			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx +	eor	r12,r8,r9			@ a^b, b^c in next round +	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r8,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r11,r11,r7			@ d+=h +	eor	r3,r3,r9			@ Maj(a,b,c) +	add	r7,r7,r0,ror#2	@ h+=Sigma0(a) +	@ add	r7,r7,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 5 +# if 5==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r11,r11,ror#5 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 5 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r11,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r11,ror#11 +	@ ldrb	r2,[r1,#3]			@ 5 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 5==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r11,r11,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r6,r6,r2			@ h+=X[i] +	str	r2,[sp,#5*4]  	eor	r2,r4,r5 -#if 5>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 5==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r11,ror#25	@ Sigma1(e) +	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r11 -	str	r3,[sp,#5*4] -	add	r3,r3,r0 +	add	r6,r6,r3			@ h+=K256[i]  	eor	r2,r2,r5			@ Ch(e,f,g) -	add	r3,r3,r6 -	mov	r6,r7,ror#2 -	add	r3,r3,r2 -	eor	r6,r6,r7,ror#13 -	add	r3,r3,r12 -	eor	r6,r6,r7,ror#22		@ Sigma0(a) -#if 5>=15 -	ldr	r1,[sp,#7*4]		@ from BODY_16_xx -#endif -	orr	r0,r7,r8 -	and	r2,r7,r8 -	and	r0,r0,r9 -	add	r6,r6,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r10,r10,r3 -	add	r6,r6,r0 +	eor	r0,r7,r7,ror#11 +	add	r6,r6,r2			@ h+=Ch(e,f,g) +#if 5==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 5<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r7,r8			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx +	eor	r3,r7,r8			@ a^b, b^c in next round +	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r7,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r10,r10,r6			@ d+=h +	eor	r12,r12,r8			@ Maj(a,b,c) +	add	r6,r6,r0,ror#2	@ h+=Sigma0(a) +	@ add	r6,r6,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 6 +# if 6==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r10,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 6 +	@ ldrb	r2,[r1,#3]			@ 6 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 6==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r10,r10,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r10,ror#19	@ Sigma1(e)  #endif -	mov	r0,r10,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r10,ror#11 +	add	r5,r5,r2			@ h+=X[i] +	str	r2,[sp,#6*4]  	eor	r2,r11,r4 -#if 6>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 6==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r10,ror#25	@ Sigma1(e) +	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r10 -	str	r3,[sp,#6*4] -	add	r3,r3,r0 +	add	r5,r5,r12			@ h+=K256[i]  	eor	r2,r2,r4			@ Ch(e,f,g) -	add	r3,r3,r5 -	mov	r5,r6,ror#2 -	add	r3,r3,r2 -	eor	r5,r5,r6,ror#13 -	add	r3,r3,r12 -	eor	r5,r5,r6,ror#22		@ Sigma0(a) -#if 6>=15 -	ldr	r1,[sp,#8*4]		@ from BODY_16_xx -#endif -	orr	r0,r6,r7 -	and	r2,r6,r7 -	and	r0,r0,r8 -	add	r5,r5,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r9,r9,r3 -	add	r5,r5,r0 +	eor	r0,r6,r6,ror#11 +	add	r5,r5,r2			@ h+=Ch(e,f,g) +#if 6==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r6,r7			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx +	eor	r12,r6,r7			@ a^b, b^c in next round +	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r6,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r9,r9,r5			@ d+=h +	eor	r3,r3,r7			@ Maj(a,b,c) +	add	r5,r5,r0,ror#2	@ h+=Sigma0(a) +	@ add	r5,r5,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 7 +# if 7==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r9,r9,ror#5 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 7 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r9,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r9,ror#11 +	@ ldrb	r2,[r1,#3]			@ 7 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 7==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r9,r9,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r4,r4,r2			@ h+=X[i] +	str	r2,[sp,#7*4]  	eor	r2,r10,r11 -#if 7>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 7==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r9,ror#25	@ Sigma1(e) +	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r9 -	str	r3,[sp,#7*4] -	add	r3,r3,r0 +	add	r4,r4,r3			@ h+=K256[i]  	eor	r2,r2,r11			@ Ch(e,f,g) -	add	r3,r3,r4 -	mov	r4,r5,ror#2 -	add	r3,r3,r2 -	eor	r4,r4,r5,ror#13 -	add	r3,r3,r12 -	eor	r4,r4,r5,ror#22		@ Sigma0(a) -#if 7>=15 -	ldr	r1,[sp,#9*4]		@ from BODY_16_xx -#endif -	orr	r0,r5,r6 -	and	r2,r5,r6 -	and	r0,r0,r7 -	add	r4,r4,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r8,r8,r3 -	add	r4,r4,r0 +	eor	r0,r5,r5,ror#11 +	add	r4,r4,r2			@ h+=Ch(e,f,g) +#if 7==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 7<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r5,r6			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx +	eor	r3,r5,r6			@ a^b, b^c in next round +	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r5,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r8,r8,r4			@ d+=h +	eor	r12,r12,r6			@ Maj(a,b,c) +	add	r4,r4,r0,ror#2	@ h+=Sigma0(a) +	@ add	r4,r4,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 8 +# if 8==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r8,r8,ror#5 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r8,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 8 +	@ ldrb	r2,[r1,#3]			@ 8 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 8==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r8,r8,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r8,ror#19	@ Sigma1(e)  #endif -	mov	r0,r8,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r8,ror#11 +	add	r11,r11,r2			@ h+=X[i] +	str	r2,[sp,#8*4]  	eor	r2,r9,r10 -#if 8>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 8==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r8,ror#25	@ Sigma1(e) +	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r8 -	str	r3,[sp,#8*4] -	add	r3,r3,r0 +	add	r11,r11,r12			@ h+=K256[i]  	eor	r2,r2,r10			@ Ch(e,f,g) -	add	r3,r3,r11 -	mov	r11,r4,ror#2 -	add	r3,r3,r2 -	eor	r11,r11,r4,ror#13 -	add	r3,r3,r12 -	eor	r11,r11,r4,ror#22		@ Sigma0(a) -#if 8>=15 -	ldr	r1,[sp,#10*4]		@ from BODY_16_xx -#endif -	orr	r0,r4,r5 -	and	r2,r4,r5 -	and	r0,r0,r6 -	add	r11,r11,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r7,r7,r3 -	add	r11,r11,r0 +	eor	r0,r4,r4,ror#11 +	add	r11,r11,r2			@ h+=Ch(e,f,g) +#if 8==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r4,r5			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx +	eor	r12,r4,r5			@ a^b, b^c in next round +	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r4,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r7,r7,r11			@ d+=h +	eor	r3,r3,r5			@ Maj(a,b,c) +	add	r11,r11,r0,ror#2	@ h+=Sigma0(a) +	@ add	r11,r11,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 9 +# if 9==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r7,r7,ror#5 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 9 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r7,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r7,ror#11 +	@ ldrb	r2,[r1,#3]			@ 9 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 9==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r7,r7,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r10,r10,r2			@ h+=X[i] +	str	r2,[sp,#9*4]  	eor	r2,r8,r9 -#if 9>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 9==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r7,ror#25	@ Sigma1(e) +	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r7 -	str	r3,[sp,#9*4] -	add	r3,r3,r0 +	add	r10,r10,r3			@ h+=K256[i]  	eor	r2,r2,r9			@ Ch(e,f,g) -	add	r3,r3,r10 -	mov	r10,r11,ror#2 -	add	r3,r3,r2 -	eor	r10,r10,r11,ror#13 -	add	r3,r3,r12 -	eor	r10,r10,r11,ror#22		@ Sigma0(a) -#if 9>=15 -	ldr	r1,[sp,#11*4]		@ from BODY_16_xx -#endif -	orr	r0,r11,r4 -	and	r2,r11,r4 -	and	r0,r0,r5 -	add	r10,r10,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r6,r6,r3 -	add	r10,r10,r0 +	eor	r0,r11,r11,ror#11 +	add	r10,r10,r2			@ h+=Ch(e,f,g) +#if 9==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 9<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r11,r4			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx +	eor	r3,r11,r4			@ a^b, b^c in next round +	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r11,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r6,r6,r10			@ d+=h +	eor	r12,r12,r4			@ Maj(a,b,c) +	add	r10,r10,r0,ror#2	@ h+=Sigma0(a) +	@ add	r10,r10,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 10 +# if 10==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r6,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 10 +	@ ldrb	r2,[r1,#3]			@ 10 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 10==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r6,r6,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r6,ror#19	@ Sigma1(e)  #endif -	mov	r0,r6,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r6,ror#11 +	add	r9,r9,r2			@ h+=X[i] +	str	r2,[sp,#10*4]  	eor	r2,r7,r8 -#if 10>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 10==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r6,ror#25	@ Sigma1(e) +	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r6 -	str	r3,[sp,#10*4] -	add	r3,r3,r0 +	add	r9,r9,r12			@ h+=K256[i]  	eor	r2,r2,r8			@ Ch(e,f,g) -	add	r3,r3,r9 -	mov	r9,r10,ror#2 -	add	r3,r3,r2 -	eor	r9,r9,r10,ror#13 -	add	r3,r3,r12 -	eor	r9,r9,r10,ror#22		@ Sigma0(a) -#if 10>=15 -	ldr	r1,[sp,#12*4]		@ from BODY_16_xx -#endif -	orr	r0,r10,r11 -	and	r2,r10,r11 -	and	r0,r0,r4 -	add	r9,r9,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r5,r5,r3 -	add	r9,r9,r0 +	eor	r0,r10,r10,ror#11 +	add	r9,r9,r2			@ h+=Ch(e,f,g) +#if 10==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r10,r11			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx +	eor	r12,r10,r11			@ a^b, b^c in next round +	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r10,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r5,r5,r9			@ d+=h +	eor	r3,r3,r11			@ Maj(a,b,c) +	add	r9,r9,r0,ror#2	@ h+=Sigma0(a) +	@ add	r9,r9,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 11 +# if 11==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r5,r5,ror#5 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 11 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r5,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r5,ror#11 +	@ ldrb	r2,[r1,#3]			@ 11 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 11==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r5,r5,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r8,r8,r2			@ h+=X[i] +	str	r2,[sp,#11*4]  	eor	r2,r6,r7 -#if 11>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 11==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r5,ror#25	@ Sigma1(e) +	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r5 -	str	r3,[sp,#11*4] -	add	r3,r3,r0 +	add	r8,r8,r3			@ h+=K256[i]  	eor	r2,r2,r7			@ Ch(e,f,g) -	add	r3,r3,r8 -	mov	r8,r9,ror#2 -	add	r3,r3,r2 -	eor	r8,r8,r9,ror#13 -	add	r3,r3,r12 -	eor	r8,r8,r9,ror#22		@ Sigma0(a) -#if 11>=15 -	ldr	r1,[sp,#13*4]		@ from BODY_16_xx -#endif -	orr	r0,r9,r10 -	and	r2,r9,r10 -	and	r0,r0,r11 -	add	r8,r8,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r4,r4,r3 -	add	r8,r8,r0 +	eor	r0,r9,r9,ror#11 +	add	r8,r8,r2			@ h+=Ch(e,f,g) +#if 11==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 11<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r9,r10			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx +	eor	r3,r9,r10			@ a^b, b^c in next round +	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r9,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r4,r4,r8			@ d+=h +	eor	r12,r12,r10			@ Maj(a,b,c) +	add	r8,r8,r0,ror#2	@ h+=Sigma0(a) +	@ add	r8,r8,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 12 +# if 12==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r4,r4,ror#5 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r4,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 12 +	@ ldrb	r2,[r1,#3]			@ 12 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 12==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r4,r4,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r4,ror#19	@ Sigma1(e)  #endif -	mov	r0,r4,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r4,ror#11 +	add	r7,r7,r2			@ h+=X[i] +	str	r2,[sp,#12*4]  	eor	r2,r5,r6 -#if 12>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 12==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r4,ror#25	@ Sigma1(e) +	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r4 -	str	r3,[sp,#12*4] -	add	r3,r3,r0 +	add	r7,r7,r12			@ h+=K256[i]  	eor	r2,r2,r6			@ Ch(e,f,g) -	add	r3,r3,r7 -	mov	r7,r8,ror#2 -	add	r3,r3,r2 -	eor	r7,r7,r8,ror#13 -	add	r3,r3,r12 -	eor	r7,r7,r8,ror#22		@ Sigma0(a) -#if 12>=15 -	ldr	r1,[sp,#14*4]		@ from BODY_16_xx -#endif -	orr	r0,r8,r9 -	and	r2,r8,r9 -	and	r0,r0,r10 -	add	r7,r7,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r11,r11,r3 -	add	r7,r7,r0 +	eor	r0,r8,r8,ror#11 +	add	r7,r7,r2			@ h+=Ch(e,f,g) +#if 12==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r8,r9			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx +	eor	r12,r8,r9			@ a^b, b^c in next round +	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r8,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r11,r11,r7			@ d+=h +	eor	r3,r3,r9			@ Maj(a,b,c) +	add	r7,r7,r0,ror#2	@ h+=Sigma0(a) +	@ add	r7,r7,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 13 +# if 13==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r11,r11,ror#5 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 13 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r11,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r11,ror#11 +	@ ldrb	r2,[r1,#3]			@ 13 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 13==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r11,r11,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r6,r6,r2			@ h+=X[i] +	str	r2,[sp,#13*4]  	eor	r2,r4,r5 -#if 13>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 13==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r11,ror#25	@ Sigma1(e) +	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r11 -	str	r3,[sp,#13*4] -	add	r3,r3,r0 +	add	r6,r6,r3			@ h+=K256[i]  	eor	r2,r2,r5			@ Ch(e,f,g) -	add	r3,r3,r6 -	mov	r6,r7,ror#2 -	add	r3,r3,r2 -	eor	r6,r6,r7,ror#13 -	add	r3,r3,r12 -	eor	r6,r6,r7,ror#22		@ Sigma0(a) -#if 13>=15 -	ldr	r1,[sp,#15*4]		@ from BODY_16_xx -#endif -	orr	r0,r7,r8 -	and	r2,r7,r8 -	and	r0,r0,r9 -	add	r6,r6,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r10,r10,r3 -	add	r6,r6,r0 +	eor	r0,r7,r7,ror#11 +	add	r6,r6,r2			@ h+=Ch(e,f,g) +#if 13==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 13<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r7,r8			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx +	eor	r3,r7,r8			@ a^b, b^c in next round +	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r7,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r10,r10,r6			@ d+=h +	eor	r12,r12,r8			@ Maj(a,b,c) +	add	r6,r6,r0,ror#2	@ h+=Sigma0(a) +	@ add	r6,r6,r12			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 14 +# if 14==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r10,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 14 +	@ ldrb	r2,[r1,#3]			@ 14 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past  	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 +	ldrb	r0,[r1,#1] +	orr	r2,r2,r12,lsl#8 +	ldrb	r12,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 14==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r10,r10,ror#5 +	orr	r2,r2,r12,lsl#24 +	eor	r0,r0,r10,ror#19	@ Sigma1(e)  #endif -	mov	r0,r10,ror#6  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r10,ror#11 +	add	r5,r5,r2			@ h+=X[i] +	str	r2,[sp,#14*4]  	eor	r2,r11,r4 -#if 14>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 14==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r10,ror#25	@ Sigma1(e) +	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r10 -	str	r3,[sp,#14*4] -	add	r3,r3,r0 +	add	r5,r5,r12			@ h+=K256[i]  	eor	r2,r2,r4			@ Ch(e,f,g) -	add	r3,r3,r5 -	mov	r5,r6,ror#2 -	add	r3,r3,r2 -	eor	r5,r5,r6,ror#13 -	add	r3,r3,r12 -	eor	r5,r5,r6,ror#22		@ Sigma0(a) -#if 14>=15 -	ldr	r1,[sp,#0*4]		@ from BODY_16_xx -#endif -	orr	r0,r6,r7 -	and	r2,r6,r7 -	and	r0,r0,r8 -	add	r5,r5,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r9,r9,r3 -	add	r5,r5,r0 +	eor	r0,r6,r6,ror#11 +	add	r5,r5,r2			@ h+=Ch(e,f,g) +#if 14==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r6,r7			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx +	eor	r12,r6,r7			@ a^b, b^c in next round +	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r6,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r9,r9,r5			@ d+=h +	eor	r3,r3,r7			@ Maj(a,b,c) +	add	r5,r5,r0,ror#2	@ h+=Sigma0(a) +	@ add	r5,r5,r3			@ h+=Maj(a,b,c)  #if __ARM_ARCH__>=7 -	ldr	r3,[r1],#4 +	@ ldr	r2,[r1],#4			@ 15 +# if 15==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r9,r9,ror#5 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +	rev	r2,r2  #else -	ldrb	r3,[r1,#3]			@ 15 -	ldrb	r12,[r1,#2] -	ldrb	r2,[r1,#1] -	ldrb	r0,[r1],#4 -	orr	r3,r3,r12,lsl#8 -	orr	r3,r3,r2,lsl#16 -	orr	r3,r3,r0,lsl#24 -#endif -	mov	r0,r9,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r9,ror#11 +	@ ldrb	r2,[r1,#3]			@ 15 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	ldrb	r3,[r1,#2] +	ldrb	r0,[r1,#1] +	orr	r2,r2,r3,lsl#8 +	ldrb	r3,[r1],#4 +	orr	r2,r2,r0,lsl#16 +# if 15==15 +	str	r1,[sp,#17*4]			@ make room for r1 +# endif +	eor	r0,r9,r9,ror#5 +	orr	r2,r2,r3,lsl#24 +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +#endif +	ldr	r3,[r14],#4			@ *K256++ +	add	r4,r4,r2			@ h+=X[i] +	str	r2,[sp,#15*4]  	eor	r2,r10,r11 -#if 15>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 15==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r9,ror#25	@ Sigma1(e) +	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r9 -	str	r3,[sp,#15*4] -	add	r3,r3,r0 +	add	r4,r4,r3			@ h+=K256[i]  	eor	r2,r2,r11			@ Ch(e,f,g) -	add	r3,r3,r4 -	mov	r4,r5,ror#2 -	add	r3,r3,r2 -	eor	r4,r4,r5,ror#13 -	add	r3,r3,r12 -	eor	r4,r4,r5,ror#22		@ Sigma0(a) -#if 15>=15 -	ldr	r1,[sp,#1*4]		@ from BODY_16_xx -#endif -	orr	r0,r5,r6 -	and	r2,r5,r6 -	and	r0,r0,r7 -	add	r4,r4,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r8,r8,r3 -	add	r4,r4,r0 +	eor	r0,r5,r5,ror#11 +	add	r4,r4,r2			@ h+=Ch(e,f,g) +#if 15==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 15<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r5,r6			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx +	eor	r3,r5,r6			@ a^b, b^c in next round +	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r5,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r8,r8,r4			@ d+=h +	eor	r12,r12,r6			@ Maj(a,b,c) +	add	r4,r4,r0,ror#2	@ h+=Sigma0(a) +	@ add	r4,r4,r12			@ h+=Maj(a,b,c)  .Lrounds_16_xx: -	@ ldr	r1,[sp,#1*4]		@ 16 -	ldr	r12,[sp,#14*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#0*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#9*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r8,ror#6 +	@ ldr	r2,[sp,#1*4]		@ 16 +	@ ldr	r1,[sp,#14*4] +	mov	r0,r2,ror#7 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#0*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#9*4] + +	add	r12,r12,r0 +	eor	r0,r8,r8,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r8,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r8,ror#11 +	add	r11,r11,r2			@ h+=X[i] +	str	r2,[sp,#0*4]  	eor	r2,r9,r10 -#if 16>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 16==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r8,ror#25	@ Sigma1(e) +	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r8 -	str	r3,[sp,#0*4] -	add	r3,r3,r0 +	add	r11,r11,r12			@ h+=K256[i]  	eor	r2,r2,r10			@ Ch(e,f,g) -	add	r3,r3,r11 -	mov	r11,r4,ror#2 -	add	r3,r3,r2 -	eor	r11,r11,r4,ror#13 -	add	r3,r3,r12 -	eor	r11,r11,r4,ror#22		@ Sigma0(a) -#if 16>=15 -	ldr	r1,[sp,#2*4]		@ from BODY_16_xx -#endif -	orr	r0,r4,r5 -	and	r2,r4,r5 -	and	r0,r0,r6 -	add	r11,r11,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r7,r7,r3 -	add	r11,r11,r0 -	@ ldr	r1,[sp,#2*4]		@ 17 -	ldr	r12,[sp,#15*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#1*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#10*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r4,r4,ror#11 +	add	r11,r11,r2			@ h+=Ch(e,f,g) +#if 16==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r4,r5			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx +	eor	r12,r4,r5			@ a^b, b^c in next round +	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r4,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r7,r7,r11			@ d+=h +	eor	r3,r3,r5			@ Maj(a,b,c) +	add	r11,r11,r0,ror#2	@ h+=Sigma0(a) +	@ add	r11,r11,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#2*4]		@ 17 +	@ ldr	r1,[sp,#15*4] +	mov	r0,r2,ror#7 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#1*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#10*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r7,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r7,ror#11 +	eor	r0,r7,r7,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r10,r10,r2			@ h+=X[i] +	str	r2,[sp,#1*4]  	eor	r2,r8,r9 -#if 17>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 17==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r7,ror#25	@ Sigma1(e) +	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r7 -	str	r3,[sp,#1*4] -	add	r3,r3,r0 +	add	r10,r10,r3			@ h+=K256[i]  	eor	r2,r2,r9			@ Ch(e,f,g) -	add	r3,r3,r10 -	mov	r10,r11,ror#2 -	add	r3,r3,r2 -	eor	r10,r10,r11,ror#13 -	add	r3,r3,r12 -	eor	r10,r10,r11,ror#22		@ Sigma0(a) -#if 17>=15 -	ldr	r1,[sp,#3*4]		@ from BODY_16_xx -#endif -	orr	r0,r11,r4 -	and	r2,r11,r4 -	and	r0,r0,r5 -	add	r10,r10,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r6,r6,r3 -	add	r10,r10,r0 -	@ ldr	r1,[sp,#3*4]		@ 18 -	ldr	r12,[sp,#0*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#2*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#11*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r6,ror#6 +	eor	r0,r11,r11,ror#11 +	add	r10,r10,r2			@ h+=Ch(e,f,g) +#if 17==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 17<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r11,r4			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx +	eor	r3,r11,r4			@ a^b, b^c in next round +	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r11,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r6,r6,r10			@ d+=h +	eor	r12,r12,r4			@ Maj(a,b,c) +	add	r10,r10,r0,ror#2	@ h+=Sigma0(a) +	@ add	r10,r10,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#3*4]		@ 18 +	@ ldr	r1,[sp,#0*4] +	mov	r0,r2,ror#7 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#2*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#11*4] + +	add	r12,r12,r0 +	eor	r0,r6,r6,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r6,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r6,ror#11 +	add	r9,r9,r2			@ h+=X[i] +	str	r2,[sp,#2*4]  	eor	r2,r7,r8 -#if 18>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 18==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r6,ror#25	@ Sigma1(e) +	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r6 -	str	r3,[sp,#2*4] -	add	r3,r3,r0 +	add	r9,r9,r12			@ h+=K256[i]  	eor	r2,r2,r8			@ Ch(e,f,g) -	add	r3,r3,r9 -	mov	r9,r10,ror#2 -	add	r3,r3,r2 -	eor	r9,r9,r10,ror#13 -	add	r3,r3,r12 -	eor	r9,r9,r10,ror#22		@ Sigma0(a) -#if 18>=15 -	ldr	r1,[sp,#4*4]		@ from BODY_16_xx -#endif -	orr	r0,r10,r11 -	and	r2,r10,r11 -	and	r0,r0,r4 -	add	r9,r9,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r5,r5,r3 -	add	r9,r9,r0 -	@ ldr	r1,[sp,#4*4]		@ 19 -	ldr	r12,[sp,#1*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#3*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#12*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r10,r10,ror#11 +	add	r9,r9,r2			@ h+=Ch(e,f,g) +#if 18==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r10,r11			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx +	eor	r12,r10,r11			@ a^b, b^c in next round +	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r10,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r5,r5,r9			@ d+=h +	eor	r3,r3,r11			@ Maj(a,b,c) +	add	r9,r9,r0,ror#2	@ h+=Sigma0(a) +	@ add	r9,r9,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#4*4]		@ 19 +	@ ldr	r1,[sp,#1*4] +	mov	r0,r2,ror#7 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#3*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#12*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r5,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r5,ror#11 +	eor	r0,r5,r5,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r8,r8,r2			@ h+=X[i] +	str	r2,[sp,#3*4]  	eor	r2,r6,r7 -#if 19>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 19==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r5,ror#25	@ Sigma1(e) +	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r5 -	str	r3,[sp,#3*4] -	add	r3,r3,r0 +	add	r8,r8,r3			@ h+=K256[i]  	eor	r2,r2,r7			@ Ch(e,f,g) -	add	r3,r3,r8 -	mov	r8,r9,ror#2 -	add	r3,r3,r2 -	eor	r8,r8,r9,ror#13 -	add	r3,r3,r12 -	eor	r8,r8,r9,ror#22		@ Sigma0(a) -#if 19>=15 -	ldr	r1,[sp,#5*4]		@ from BODY_16_xx -#endif -	orr	r0,r9,r10 -	and	r2,r9,r10 -	and	r0,r0,r11 -	add	r8,r8,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r4,r4,r3 -	add	r8,r8,r0 -	@ ldr	r1,[sp,#5*4]		@ 20 -	ldr	r12,[sp,#2*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#4*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#13*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r4,ror#6 +	eor	r0,r9,r9,ror#11 +	add	r8,r8,r2			@ h+=Ch(e,f,g) +#if 19==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 19<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r9,r10			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx +	eor	r3,r9,r10			@ a^b, b^c in next round +	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r9,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r4,r4,r8			@ d+=h +	eor	r12,r12,r10			@ Maj(a,b,c) +	add	r8,r8,r0,ror#2	@ h+=Sigma0(a) +	@ add	r8,r8,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#5*4]		@ 20 +	@ ldr	r1,[sp,#2*4] +	mov	r0,r2,ror#7 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#4*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#13*4] + +	add	r12,r12,r0 +	eor	r0,r4,r4,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r4,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r4,ror#11 +	add	r7,r7,r2			@ h+=X[i] +	str	r2,[sp,#4*4]  	eor	r2,r5,r6 -#if 20>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 20==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r4,ror#25	@ Sigma1(e) +	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r4 -	str	r3,[sp,#4*4] -	add	r3,r3,r0 +	add	r7,r7,r12			@ h+=K256[i]  	eor	r2,r2,r6			@ Ch(e,f,g) -	add	r3,r3,r7 -	mov	r7,r8,ror#2 -	add	r3,r3,r2 -	eor	r7,r7,r8,ror#13 -	add	r3,r3,r12 -	eor	r7,r7,r8,ror#22		@ Sigma0(a) -#if 20>=15 -	ldr	r1,[sp,#6*4]		@ from BODY_16_xx -#endif -	orr	r0,r8,r9 -	and	r2,r8,r9 -	and	r0,r0,r10 -	add	r7,r7,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r11,r11,r3 -	add	r7,r7,r0 -	@ ldr	r1,[sp,#6*4]		@ 21 -	ldr	r12,[sp,#3*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#5*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#14*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r8,r8,ror#11 +	add	r7,r7,r2			@ h+=Ch(e,f,g) +#if 20==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r8,r9			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx +	eor	r12,r8,r9			@ a^b, b^c in next round +	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r8,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r11,r11,r7			@ d+=h +	eor	r3,r3,r9			@ Maj(a,b,c) +	add	r7,r7,r0,ror#2	@ h+=Sigma0(a) +	@ add	r7,r7,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#6*4]		@ 21 +	@ ldr	r1,[sp,#3*4] +	mov	r0,r2,ror#7 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#5*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#14*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r11,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r11,ror#11 +	eor	r0,r11,r11,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r6,r6,r2			@ h+=X[i] +	str	r2,[sp,#5*4]  	eor	r2,r4,r5 -#if 21>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 21==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r11,ror#25	@ Sigma1(e) +	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r11 -	str	r3,[sp,#5*4] -	add	r3,r3,r0 +	add	r6,r6,r3			@ h+=K256[i]  	eor	r2,r2,r5			@ Ch(e,f,g) -	add	r3,r3,r6 -	mov	r6,r7,ror#2 -	add	r3,r3,r2 -	eor	r6,r6,r7,ror#13 -	add	r3,r3,r12 -	eor	r6,r6,r7,ror#22		@ Sigma0(a) -#if 21>=15 -	ldr	r1,[sp,#7*4]		@ from BODY_16_xx -#endif -	orr	r0,r7,r8 -	and	r2,r7,r8 -	and	r0,r0,r9 -	add	r6,r6,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r10,r10,r3 -	add	r6,r6,r0 -	@ ldr	r1,[sp,#7*4]		@ 22 -	ldr	r12,[sp,#4*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#6*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#15*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r10,ror#6 +	eor	r0,r7,r7,ror#11 +	add	r6,r6,r2			@ h+=Ch(e,f,g) +#if 21==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 21<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r7,r8			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx +	eor	r3,r7,r8			@ a^b, b^c in next round +	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r7,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r10,r10,r6			@ d+=h +	eor	r12,r12,r8			@ Maj(a,b,c) +	add	r6,r6,r0,ror#2	@ h+=Sigma0(a) +	@ add	r6,r6,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#7*4]		@ 22 +	@ ldr	r1,[sp,#4*4] +	mov	r0,r2,ror#7 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#6*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#15*4] + +	add	r12,r12,r0 +	eor	r0,r10,r10,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r10,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r10,ror#11 +	add	r5,r5,r2			@ h+=X[i] +	str	r2,[sp,#6*4]  	eor	r2,r11,r4 -#if 22>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 22==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r10,ror#25	@ Sigma1(e) +	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r10 -	str	r3,[sp,#6*4] -	add	r3,r3,r0 +	add	r5,r5,r12			@ h+=K256[i]  	eor	r2,r2,r4			@ Ch(e,f,g) -	add	r3,r3,r5 -	mov	r5,r6,ror#2 -	add	r3,r3,r2 -	eor	r5,r5,r6,ror#13 -	add	r3,r3,r12 -	eor	r5,r5,r6,ror#22		@ Sigma0(a) -#if 22>=15 -	ldr	r1,[sp,#8*4]		@ from BODY_16_xx -#endif -	orr	r0,r6,r7 -	and	r2,r6,r7 -	and	r0,r0,r8 -	add	r5,r5,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r9,r9,r3 -	add	r5,r5,r0 -	@ ldr	r1,[sp,#8*4]		@ 23 -	ldr	r12,[sp,#5*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#7*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#0*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r6,r6,ror#11 +	add	r5,r5,r2			@ h+=Ch(e,f,g) +#if 22==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r6,r7			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx +	eor	r12,r6,r7			@ a^b, b^c in next round +	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r6,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r9,r9,r5			@ d+=h +	eor	r3,r3,r7			@ Maj(a,b,c) +	add	r5,r5,r0,ror#2	@ h+=Sigma0(a) +	@ add	r5,r5,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#8*4]		@ 23 +	@ ldr	r1,[sp,#5*4] +	mov	r0,r2,ror#7 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#7*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#0*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r9,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r9,ror#11 +	eor	r0,r9,r9,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r4,r4,r2			@ h+=X[i] +	str	r2,[sp,#7*4]  	eor	r2,r10,r11 -#if 23>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 23==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r9,ror#25	@ Sigma1(e) +	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r9 -	str	r3,[sp,#7*4] -	add	r3,r3,r0 +	add	r4,r4,r3			@ h+=K256[i]  	eor	r2,r2,r11			@ Ch(e,f,g) -	add	r3,r3,r4 -	mov	r4,r5,ror#2 -	add	r3,r3,r2 -	eor	r4,r4,r5,ror#13 -	add	r3,r3,r12 -	eor	r4,r4,r5,ror#22		@ Sigma0(a) -#if 23>=15 -	ldr	r1,[sp,#9*4]		@ from BODY_16_xx -#endif -	orr	r0,r5,r6 -	and	r2,r5,r6 -	and	r0,r0,r7 -	add	r4,r4,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r8,r8,r3 -	add	r4,r4,r0 -	@ ldr	r1,[sp,#9*4]		@ 24 -	ldr	r12,[sp,#6*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#8*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#1*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r8,ror#6 +	eor	r0,r5,r5,ror#11 +	add	r4,r4,r2			@ h+=Ch(e,f,g) +#if 23==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 23<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r5,r6			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx +	eor	r3,r5,r6			@ a^b, b^c in next round +	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r5,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r8,r8,r4			@ d+=h +	eor	r12,r12,r6			@ Maj(a,b,c) +	add	r4,r4,r0,ror#2	@ h+=Sigma0(a) +	@ add	r4,r4,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#9*4]		@ 24 +	@ ldr	r1,[sp,#6*4] +	mov	r0,r2,ror#7 +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#8*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#1*4] + +	add	r12,r12,r0 +	eor	r0,r8,r8,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r8,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r8,ror#11 +	add	r11,r11,r2			@ h+=X[i] +	str	r2,[sp,#8*4]  	eor	r2,r9,r10 -#if 24>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 24==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r8,ror#25	@ Sigma1(e) +	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r8 -	str	r3,[sp,#8*4] -	add	r3,r3,r0 +	add	r11,r11,r12			@ h+=K256[i]  	eor	r2,r2,r10			@ Ch(e,f,g) -	add	r3,r3,r11 -	mov	r11,r4,ror#2 -	add	r3,r3,r2 -	eor	r11,r11,r4,ror#13 -	add	r3,r3,r12 -	eor	r11,r11,r4,ror#22		@ Sigma0(a) -#if 24>=15 -	ldr	r1,[sp,#10*4]		@ from BODY_16_xx -#endif -	orr	r0,r4,r5 -	and	r2,r4,r5 -	and	r0,r0,r6 -	add	r11,r11,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r7,r7,r3 -	add	r11,r11,r0 -	@ ldr	r1,[sp,#10*4]		@ 25 -	ldr	r12,[sp,#7*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#9*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#2*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r4,r4,ror#11 +	add	r11,r11,r2			@ h+=Ch(e,f,g) +#if 24==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r4,r5			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx +	eor	r12,r4,r5			@ a^b, b^c in next round +	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r4,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r7,r7,r11			@ d+=h +	eor	r3,r3,r5			@ Maj(a,b,c) +	add	r11,r11,r0,ror#2	@ h+=Sigma0(a) +	@ add	r11,r11,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#10*4]		@ 25 +	@ ldr	r1,[sp,#7*4] +	mov	r0,r2,ror#7 +	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#9*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#2*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r7,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r7,ror#11 +	eor	r0,r7,r7,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r7,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r10,r10,r2			@ h+=X[i] +	str	r2,[sp,#9*4]  	eor	r2,r8,r9 -#if 25>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 25==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r7,ror#25	@ Sigma1(e) +	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r7 -	str	r3,[sp,#9*4] -	add	r3,r3,r0 +	add	r10,r10,r3			@ h+=K256[i]  	eor	r2,r2,r9			@ Ch(e,f,g) -	add	r3,r3,r10 -	mov	r10,r11,ror#2 -	add	r3,r3,r2 -	eor	r10,r10,r11,ror#13 -	add	r3,r3,r12 -	eor	r10,r10,r11,ror#22		@ Sigma0(a) -#if 25>=15 -	ldr	r1,[sp,#11*4]		@ from BODY_16_xx -#endif -	orr	r0,r11,r4 -	and	r2,r11,r4 -	and	r0,r0,r5 -	add	r10,r10,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r6,r6,r3 -	add	r10,r10,r0 -	@ ldr	r1,[sp,#11*4]		@ 26 -	ldr	r12,[sp,#8*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#10*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#3*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r6,ror#6 +	eor	r0,r11,r11,ror#11 +	add	r10,r10,r2			@ h+=Ch(e,f,g) +#if 25==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 25<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r11,r4			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx +	eor	r3,r11,r4			@ a^b, b^c in next round +	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r11,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r6,r6,r10			@ d+=h +	eor	r12,r12,r4			@ Maj(a,b,c) +	add	r10,r10,r0,ror#2	@ h+=Sigma0(a) +	@ add	r10,r10,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#11*4]		@ 26 +	@ ldr	r1,[sp,#8*4] +	mov	r0,r2,ror#7 +	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#10*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#3*4] + +	add	r12,r12,r0 +	eor	r0,r6,r6,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r6,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r6,ror#11 +	add	r9,r9,r2			@ h+=X[i] +	str	r2,[sp,#10*4]  	eor	r2,r7,r8 -#if 26>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 26==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r6,ror#25	@ Sigma1(e) +	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r6 -	str	r3,[sp,#10*4] -	add	r3,r3,r0 +	add	r9,r9,r12			@ h+=K256[i]  	eor	r2,r2,r8			@ Ch(e,f,g) -	add	r3,r3,r9 -	mov	r9,r10,ror#2 -	add	r3,r3,r2 -	eor	r9,r9,r10,ror#13 -	add	r3,r3,r12 -	eor	r9,r9,r10,ror#22		@ Sigma0(a) -#if 26>=15 -	ldr	r1,[sp,#12*4]		@ from BODY_16_xx -#endif -	orr	r0,r10,r11 -	and	r2,r10,r11 -	and	r0,r0,r4 -	add	r9,r9,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r5,r5,r3 -	add	r9,r9,r0 -	@ ldr	r1,[sp,#12*4]		@ 27 -	ldr	r12,[sp,#9*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#11*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#4*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r10,r10,ror#11 +	add	r9,r9,r2			@ h+=Ch(e,f,g) +#if 26==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r10,r11			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx +	eor	r12,r10,r11			@ a^b, b^c in next round +	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r10,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r5,r5,r9			@ d+=h +	eor	r3,r3,r11			@ Maj(a,b,c) +	add	r9,r9,r0,ror#2	@ h+=Sigma0(a) +	@ add	r9,r9,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#12*4]		@ 27 +	@ ldr	r1,[sp,#9*4] +	mov	r0,r2,ror#7 +	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#11*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#4*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r5,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r5,ror#11 +	eor	r0,r5,r5,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r5,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r8,r8,r2			@ h+=X[i] +	str	r2,[sp,#11*4]  	eor	r2,r6,r7 -#if 27>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 27==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r5,ror#25	@ Sigma1(e) +	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r5 -	str	r3,[sp,#11*4] -	add	r3,r3,r0 +	add	r8,r8,r3			@ h+=K256[i]  	eor	r2,r2,r7			@ Ch(e,f,g) -	add	r3,r3,r8 -	mov	r8,r9,ror#2 -	add	r3,r3,r2 -	eor	r8,r8,r9,ror#13 -	add	r3,r3,r12 -	eor	r8,r8,r9,ror#22		@ Sigma0(a) -#if 27>=15 -	ldr	r1,[sp,#13*4]		@ from BODY_16_xx -#endif -	orr	r0,r9,r10 -	and	r2,r9,r10 -	and	r0,r0,r11 -	add	r8,r8,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r4,r4,r3 -	add	r8,r8,r0 -	@ ldr	r1,[sp,#13*4]		@ 28 -	ldr	r12,[sp,#10*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#12*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#5*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r4,ror#6 +	eor	r0,r9,r9,ror#11 +	add	r8,r8,r2			@ h+=Ch(e,f,g) +#if 27==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 27<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r9,r10			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx +	eor	r3,r9,r10			@ a^b, b^c in next round +	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r9,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r4,r4,r8			@ d+=h +	eor	r12,r12,r10			@ Maj(a,b,c) +	add	r8,r8,r0,ror#2	@ h+=Sigma0(a) +	@ add	r8,r8,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#13*4]		@ 28 +	@ ldr	r1,[sp,#10*4] +	mov	r0,r2,ror#7 +	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#12*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#5*4] + +	add	r12,r12,r0 +	eor	r0,r4,r4,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r4,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r4,ror#11 +	add	r7,r7,r2			@ h+=X[i] +	str	r2,[sp,#12*4]  	eor	r2,r5,r6 -#if 28>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 28==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r4,ror#25	@ Sigma1(e) +	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r4 -	str	r3,[sp,#12*4] -	add	r3,r3,r0 +	add	r7,r7,r12			@ h+=K256[i]  	eor	r2,r2,r6			@ Ch(e,f,g) -	add	r3,r3,r7 -	mov	r7,r8,ror#2 -	add	r3,r3,r2 -	eor	r7,r7,r8,ror#13 -	add	r3,r3,r12 -	eor	r7,r7,r8,ror#22		@ Sigma0(a) -#if 28>=15 -	ldr	r1,[sp,#14*4]		@ from BODY_16_xx -#endif -	orr	r0,r8,r9 -	and	r2,r8,r9 -	and	r0,r0,r10 -	add	r7,r7,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r11,r11,r3 -	add	r7,r7,r0 -	@ ldr	r1,[sp,#14*4]		@ 29 -	ldr	r12,[sp,#11*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#13*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#6*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r8,r8,ror#11 +	add	r7,r7,r2			@ h+=Ch(e,f,g) +#if 28==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r8,r9			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx +	eor	r12,r8,r9			@ a^b, b^c in next round +	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r8,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r11,r11,r7			@ d+=h +	eor	r3,r3,r9			@ Maj(a,b,c) +	add	r7,r7,r0,ror#2	@ h+=Sigma0(a) +	@ add	r7,r7,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#14*4]		@ 29 +	@ ldr	r1,[sp,#11*4] +	mov	r0,r2,ror#7 +	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#13*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#6*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r11,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r11,ror#11 +	eor	r0,r11,r11,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r11,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r6,r6,r2			@ h+=X[i] +	str	r2,[sp,#13*4]  	eor	r2,r4,r5 -#if 29>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 29==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r11,ror#25	@ Sigma1(e) +	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r11 -	str	r3,[sp,#13*4] -	add	r3,r3,r0 +	add	r6,r6,r3			@ h+=K256[i]  	eor	r2,r2,r5			@ Ch(e,f,g) -	add	r3,r3,r6 -	mov	r6,r7,ror#2 -	add	r3,r3,r2 -	eor	r6,r6,r7,ror#13 -	add	r3,r3,r12 -	eor	r6,r6,r7,ror#22		@ Sigma0(a) -#if 29>=15 -	ldr	r1,[sp,#15*4]		@ from BODY_16_xx -#endif -	orr	r0,r7,r8 -	and	r2,r7,r8 -	and	r0,r0,r9 -	add	r6,r6,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r10,r10,r3 -	add	r6,r6,r0 -	@ ldr	r1,[sp,#15*4]		@ 30 -	ldr	r12,[sp,#12*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#14*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#7*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 -	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r10,ror#6 +	eor	r0,r7,r7,ror#11 +	add	r6,r6,r2			@ h+=Ch(e,f,g) +#if 29==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? 
+#endif +#if 29<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r7,r8			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx +	eor	r3,r7,r8			@ a^b, b^c in next round +	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r7,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r10,r10,r6			@ d+=h +	eor	r12,r12,r8			@ Maj(a,b,c) +	add	r6,r6,r0,ror#2	@ h+=Sigma0(a) +	@ add	r6,r6,r12			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#15*4]		@ 30 +	@ ldr	r1,[sp,#12*4] +	mov	r0,r2,ror#7 +	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past +	mov	r12,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r12,r12,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#14*4] +	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#7*4] + +	add	r12,r12,r0 +	eor	r0,r10,r10,ror#5	@ from BODY_00_15 +	add	r2,r2,r12 +	eor	r0,r0,r10,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i]  	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r10,ror#11 +	add	r5,r5,r2			@ h+=X[i] +	str	r2,[sp,#14*4]  	eor	r2,r11,r4 -#if 30>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 30==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r10,ror#25	@ Sigma1(e) +	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r10 -	str	r3,[sp,#14*4] -	add	r3,r3,r0 +	add	r5,r5,r12			@ h+=K256[i]  	eor	r2,r2,r4			@ Ch(e,f,g) -	add	r3,r3,r5 -	mov	r5,r6,ror#2 -	add	r3,r3,r2 -	eor	r5,r5,r6,ror#13 -	add	r3,r3,r12 -	eor	r5,r5,r6,ror#22		@ Sigma0(a) -#if 30>=15 -	ldr	r1,[sp,#0*4]		@ from BODY_16_xx -#endif -	orr	r0,r6,r7 -	and	r2,r6,r7 -	and	r0,r0,r8 -	add	r5,r5,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r9,r9,r3 -	add	r5,r5,r0 -	@ ldr	r1,[sp,#0*4]		@ 31 -	ldr	r12,[sp,#13*4] -	mov	r0,r1,ror#7 -	ldr	r3,[sp,#15*4] -	eor	r0,r0,r1,ror#18 -	ldr	r2,[sp,#8*4] -	eor	r0,r0,r1,lsr#3	@ sigma0(X[i+1]) -	mov	r1,r12,ror#17 +	eor	r0,r6,r6,ror#11 +	add	r5,r5,r2			@ h+=Ch(e,f,g) +#if 30==31 +	and	r12,r12,#0xff +	cmp	r12,#0xf2			@ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r12,r6,r7			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx +	eor	r12,r6,r7			@ a^b, b^c in next round +	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r6,ror#20	@ Sigma0(a) +	and	r3,r3,r12			@ (b^c)&=(a^b) +	add	r9,r9,r5			@ d+=h +	eor	r3,r3,r7			@ Maj(a,b,c) +	add	r5,r5,r0,ror#2	@ h+=Sigma0(a) +	@ add	r5,r5,r3			@ h+=Maj(a,b,c) +	@ ldr	r2,[sp,#0*4]		@ 31 +	@ ldr	r1,[sp,#13*4] +	mov	r0,r2,ror#7 +	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past +	mov	r3,r1,ror#17 +	eor	r0,r0,r2,ror#18 +	eor	r3,r3,r1,ror#19 +	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1]) +	ldr	r2,[sp,#15*4] +	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14]) +	ldr	r1,[sp,#8*4] +  	add	r3,r3,r0 -	eor	r1,r1,r12,ror#19 -	add	r3,r3,r2 -	eor	r1,r1,r12,lsr#10	@ sigma1(X[i+14]) -	@ add	r3,r3,r1 -	mov	r0,r9,ror#6 -	ldr	r12,[r14],#4			@ *K256++ -	eor	r0,r0,r9,ror#11 +	eor	r0,r9,r9,ror#5	@ from BODY_00_15 +	add	r2,r2,r3 +	eor	r0,r0,r9,ror#19	@ Sigma1(e) +	add	r2,r2,r1			@ X[i] +	ldr	r3,[r14],#4			@ *K256++ +	add	r4,r4,r2			@ h+=X[i] +	str	r2,[sp,#15*4]  	eor	r2,r10,r11 -#if 31>=16 -	add	r3,r3,r1			@ from BODY_16_xx -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) -	rev	r3,r3 -#endif -#if 31==15 -	str	r1,[sp,#17*4]			@ leave room for r1 -#endif -	eor	r0,r0,r9,ror#25	@ Sigma1(e) +	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)  	and	r2,r2,r9 -	str	r3,[sp,#15*4] -	add	r3,r3,r0 +	add	r4,r4,r3			@ h+=K256[i]  	eor	r2,r2,r11			@ Ch(e,f,g) -	add	r3,r3,r4 -	mov	r4,r5,ror#2 -	add	r3,r3,r2 -	eor	r4,r4,r5,ror#13 -	add	r3,r3,r12 -	eor	r4,r4,r5,ror#22		@ Sigma0(a) -#if 31>=15 -	ldr	r1,[sp,#1*4]		@ from BODY_16_xx -#endif -	orr	r0,r5,r6 -	and	r2,r5,r6 -	and	r0,r0,r7 -	add	r4,r4,r3 -	orr	r0,r0,r2			@ Maj(a,b,c) -	add	r8,r8,r3 -	add	r4,r4,r0 -	and	r12,r12,#0xff -	cmp	r12,#0xf2 +	eor	r0,r5,r5,ror#11 +	add	r4,r4,r2			@ h+=Ch(e,f,g) +#if 31==31 +	and	r3,r3,#0xff +	cmp	r3,#0xf2			@ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 +	ldr	r2,[r1],#4			@ prefetch +# else +	ldrb	r2,[r1,#3] +# endif +	eor	r3,r5,r6			@ a^b, b^c in next round +#else +	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx +	eor	r3,r5,r6			@ a^b, b^c in next round +	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx +#endif +	eor	r0,r0,r5,ror#20	@ Sigma0(a) +	and	r12,r12,r3			@ (b^c)&=(a^b) +	add	r8,r8,r4			@ d+=h +	eor	r12,r12,r6			@ Maj(a,b,c) +	add	r4,r4,r0,ror#2	@ h+=Sigma0(a) +	@ add	r4,r4,r12			@ h+=Maj(a,b,c) +	ldreq	r3,[sp,#16*4]		@ pull ctx  	bne	.Lrounds_16_xx -	ldr	r3,[sp,#16*4]		@ pull ctx +	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past  	ldr	r0,[r3,#0]  	ldr	r2,[r3,#4]  	ldr	r12,[r3,#8] @@ -1512,6 +1770,921 @@ sha256_block_data_order:  	moveq	pc,lr			@ be binary compatible with V4, yet  	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)  #endif -.size   sha256_block_data_order,.-sha256_block_data_order -.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +.size	sha256_block_data_order,.-sha256_block_data_order +#if __ARM_ARCH__>=7 +.fpu	neon + +.type	sha256_block_data_order_neon,%function +.align	4 +sha256_block_data_order_neon: +.LNEON: +	stmdb	sp!,{r4-r12,lr} + +	mov	r12,sp +	sub	sp,sp,#16*4+16		@ alloca +	sub	r14,r3,#256+32	@ K256 +	bic	sp,sp,#15		@ align for 128-bit stores + +	vld1.8		{q0},[r1]! +	vld1.8		{q1},[r1]! +	vld1.8		{q2},[r1]! +	vld1.8		{q3},[r1]! +	vld1.32		{q8},[r14,:128]! +	vld1.32		{q9},[r14,:128]! +	vld1.32		{q10},[r14,:128]! +	vld1.32		{q11},[r14,:128]! 
+	vrev32.8	q0,q0		@ yes, even on +	str		r0,[sp,#64] +	vrev32.8	q1,q1		@ big-endian +	str		r1,[sp,#68] +	mov		r1,sp +	vrev32.8	q2,q2 +	str		r2,[sp,#72] +	vrev32.8	q3,q3 +	str		r12,[sp,#76]		@ save original sp +	vadd.i32	q8,q8,q0 +	vadd.i32	q9,q9,q1 +	vst1.32		{q8},[r1,:128]! +	vadd.i32	q10,q10,q2 +	vst1.32		{q9},[r1,:128]! +	vadd.i32	q11,q11,q3 +	vst1.32		{q10},[r1,:128]! +	vst1.32		{q11},[r1,:128]! + +	ldmia		r0,{r4-r11} +	sub		r1,r1,#64 +	ldr		r2,[sp,#0] +	eor		r12,r12,r12 +	eor		r3,r5,r6 +	b		.L_00_48 + +.align	4 +.L_00_48: +	vext.8	q8,q0,q1,#4 +	add	r11,r11,r2 +	eor	r2,r9,r10 +	eor	r0,r8,r8,ror#5 +	vext.8	q9,q2,q3,#4 +	add	r4,r4,r12 +	and	r2,r2,r8 +	eor	r12,r0,r8,ror#19 +	vshr.u32	q10,q8,#7 +	eor	r0,r4,r4,ror#11 +	eor	r2,r2,r10 +	vadd.i32	q0,q0,q9 +	add	r11,r11,r12,ror#6 +	eor	r12,r4,r5 +	vshr.u32	q9,q8,#3 +	eor	r0,r0,r4,ror#20 +	add	r11,r11,r2 +	vsli.32	q10,q8,#25 +	ldr	r2,[sp,#4] +	and	r3,r3,r12 +	vshr.u32	q11,q8,#18 +	add	r7,r7,r11 +	add	r11,r11,r0,ror#2 +	eor	r3,r3,r5 +	veor	q9,q9,q10 +	add	r10,r10,r2 +	vsli.32	q11,q8,#14 +	eor	r2,r8,r9 +	eor	r0,r7,r7,ror#5 +	vshr.u32	d24,d7,#17 +	add	r11,r11,r3 +	and	r2,r2,r7 +	veor	q9,q9,q11 +	eor	r3,r0,r7,ror#19 +	eor	r0,r11,r11,ror#11 +	vsli.32	d24,d7,#15 +	eor	r2,r2,r9 +	add	r10,r10,r3,ror#6 +	vshr.u32	d25,d7,#10 +	eor	r3,r11,r4 +	eor	r0,r0,r11,ror#20 +	vadd.i32	q0,q0,q9 +	add	r10,r10,r2 +	ldr	r2,[sp,#8] +	veor	d25,d25,d24 +	and	r12,r12,r3 +	add	r6,r6,r10 +	vshr.u32	d24,d7,#19 +	add	r10,r10,r0,ror#2 +	eor	r12,r12,r4 +	vsli.32	d24,d7,#13 +	add	r9,r9,r2 +	eor	r2,r7,r8 +	veor	d25,d25,d24 +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12 +	vadd.i32	d0,d0,d25 +	and	r2,r2,r6 +	eor	r12,r0,r6,ror#19 +	vshr.u32	d24,d0,#17 +	eor	r0,r10,r10,ror#11 +	eor	r2,r2,r8 +	vsli.32	d24,d0,#15 +	add	r9,r9,r12,ror#6 +	eor	r12,r10,r11 +	vshr.u32	d25,d0,#10 +	eor	r0,r0,r10,ror#20 +	add	r9,r9,r2 +	veor	d25,d25,d24 +	ldr	r2,[sp,#12] +	and	r3,r3,r12 +	vshr.u32	d24,d0,#19 +	add	r5,r5,r9 +	add	r9,r9,r0,ror#2 +	eor	r3,r3,r11 +	vld1.32	{q8},[r14,:128]! +	add	r8,r8,r2 +	vsli.32	d24,d0,#13 +	eor	r2,r6,r7 +	eor	r0,r5,r5,ror#5 +	veor	d25,d25,d24 +	add	r9,r9,r3 +	and	r2,r2,r5 +	vadd.i32	d1,d1,d25 +	eor	r3,r0,r5,ror#19 +	eor	r0,r9,r9,ror#11 +	vadd.i32	q8,q8,q0 +	eor	r2,r2,r7 +	add	r8,r8,r3,ror#6 +	eor	r3,r9,r10 +	eor	r0,r0,r9,ror#20 +	add	r8,r8,r2 +	ldr	r2,[sp,#16] +	and	r12,r12,r3 +	add	r4,r4,r8 +	vst1.32	{q8},[r1,:128]! 
+	add	r8,r8,r0,ror#2 +	eor	r12,r12,r10 +	vext.8	q8,q1,q2,#4 +	add	r7,r7,r2 +	eor	r2,r5,r6 +	eor	r0,r4,r4,ror#5 +	vext.8	q9,q3,q0,#4 +	add	r8,r8,r12 +	and	r2,r2,r4 +	eor	r12,r0,r4,ror#19 +	vshr.u32	q10,q8,#7 +	eor	r0,r8,r8,ror#11 +	eor	r2,r2,r6 +	vadd.i32	q1,q1,q9 +	add	r7,r7,r12,ror#6 +	eor	r12,r8,r9 +	vshr.u32	q9,q8,#3 +	eor	r0,r0,r8,ror#20 +	add	r7,r7,r2 +	vsli.32	q10,q8,#25 +	ldr	r2,[sp,#20] +	and	r3,r3,r12 +	vshr.u32	q11,q8,#18 +	add	r11,r11,r7 +	add	r7,r7,r0,ror#2 +	eor	r3,r3,r9 +	veor	q9,q9,q10 +	add	r6,r6,r2 +	vsli.32	q11,q8,#14 +	eor	r2,r4,r5 +	eor	r0,r11,r11,ror#5 +	vshr.u32	d24,d1,#17 +	add	r7,r7,r3 +	and	r2,r2,r11 +	veor	q9,q9,q11 +	eor	r3,r0,r11,ror#19 +	eor	r0,r7,r7,ror#11 +	vsli.32	d24,d1,#15 +	eor	r2,r2,r5 +	add	r6,r6,r3,ror#6 +	vshr.u32	d25,d1,#10 +	eor	r3,r7,r8 +	eor	r0,r0,r7,ror#20 +	vadd.i32	q1,q1,q9 +	add	r6,r6,r2 +	ldr	r2,[sp,#24] +	veor	d25,d25,d24 +	and	r12,r12,r3 +	add	r10,r10,r6 +	vshr.u32	d24,d1,#19 +	add	r6,r6,r0,ror#2 +	eor	r12,r12,r8 +	vsli.32	d24,d1,#13 +	add	r5,r5,r2 +	eor	r2,r11,r4 +	veor	d25,d25,d24 +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12 +	vadd.i32	d2,d2,d25 +	and	r2,r2,r10 +	eor	r12,r0,r10,ror#19 +	vshr.u32	d24,d2,#17 +	eor	r0,r6,r6,ror#11 +	eor	r2,r2,r4 +	vsli.32	d24,d2,#15 +	add	r5,r5,r12,ror#6 +	eor	r12,r6,r7 +	vshr.u32	d25,d2,#10 +	eor	r0,r0,r6,ror#20 +	add	r5,r5,r2 +	veor	d25,d25,d24 +	ldr	r2,[sp,#28] +	and	r3,r3,r12 +	vshr.u32	d24,d2,#19 +	add	r9,r9,r5 +	add	r5,r5,r0,ror#2 +	eor	r3,r3,r7 +	vld1.32	{q8},[r14,:128]! +	add	r4,r4,r2 +	vsli.32	d24,d2,#13 +	eor	r2,r10,r11 +	eor	r0,r9,r9,ror#5 +	veor	d25,d25,d24 +	add	r5,r5,r3 +	and	r2,r2,r9 +	vadd.i32	d3,d3,d25 +	eor	r3,r0,r9,ror#19 +	eor	r0,r5,r5,ror#11 +	vadd.i32	q8,q8,q1 +	eor	r2,r2,r11 +	add	r4,r4,r3,ror#6 +	eor	r3,r5,r6 +	eor	r0,r0,r5,ror#20 +	add	r4,r4,r2 +	ldr	r2,[sp,#32] +	and	r12,r12,r3 +	add	r8,r8,r4 +	vst1.32	{q8},[r1,:128]! +	add	r4,r4,r0,ror#2 +	eor	r12,r12,r6 +	vext.8	q8,q2,q3,#4 +	add	r11,r11,r2 +	eor	r2,r9,r10 +	eor	r0,r8,r8,ror#5 +	vext.8	q9,q0,q1,#4 +	add	r4,r4,r12 +	and	r2,r2,r8 +	eor	r12,r0,r8,ror#19 +	vshr.u32	q10,q8,#7 +	eor	r0,r4,r4,ror#11 +	eor	r2,r2,r10 +	vadd.i32	q2,q2,q9 +	add	r11,r11,r12,ror#6 +	eor	r12,r4,r5 +	vshr.u32	q9,q8,#3 +	eor	r0,r0,r4,ror#20 +	add	r11,r11,r2 +	vsli.32	q10,q8,#25 +	ldr	r2,[sp,#36] +	and	r3,r3,r12 +	vshr.u32	q11,q8,#18 +	add	r7,r7,r11 +	add	r11,r11,r0,ror#2 +	eor	r3,r3,r5 +	veor	q9,q9,q10 +	add	r10,r10,r2 +	vsli.32	q11,q8,#14 +	eor	r2,r8,r9 +	eor	r0,r7,r7,ror#5 +	vshr.u32	d24,d3,#17 +	add	r11,r11,r3 +	and	r2,r2,r7 +	veor	q9,q9,q11 +	eor	r3,r0,r7,ror#19 +	eor	r0,r11,r11,ror#11 +	vsli.32	d24,d3,#15 +	eor	r2,r2,r9 +	add	r10,r10,r3,ror#6 +	vshr.u32	d25,d3,#10 +	eor	r3,r11,r4 +	eor	r0,r0,r11,ror#20 +	vadd.i32	q2,q2,q9 +	add	r10,r10,r2 +	ldr	r2,[sp,#40] +	veor	d25,d25,d24 +	and	r12,r12,r3 +	add	r6,r6,r10 +	vshr.u32	d24,d3,#19 +	add	r10,r10,r0,ror#2 +	eor	r12,r12,r4 +	vsli.32	d24,d3,#13 +	add	r9,r9,r2 +	eor	r2,r7,r8 +	veor	d25,d25,d24 +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12 +	vadd.i32	d4,d4,d25 +	and	r2,r2,r6 +	eor	r12,r0,r6,ror#19 +	vshr.u32	d24,d4,#17 +	eor	r0,r10,r10,ror#11 +	eor	r2,r2,r8 +	vsli.32	d24,d4,#15 +	add	r9,r9,r12,ror#6 +	eor	r12,r10,r11 +	vshr.u32	d25,d4,#10 +	eor	r0,r0,r10,ror#20 +	add	r9,r9,r2 +	veor	d25,d25,d24 +	ldr	r2,[sp,#44] +	and	r3,r3,r12 +	vshr.u32	d24,d4,#19 +	add	r5,r5,r9 +	add	r9,r9,r0,ror#2 +	eor	r3,r3,r11 +	vld1.32	{q8},[r14,:128]! 
+	add	r8,r8,r2 +	vsli.32	d24,d4,#13 +	eor	r2,r6,r7 +	eor	r0,r5,r5,ror#5 +	veor	d25,d25,d24 +	add	r9,r9,r3 +	and	r2,r2,r5 +	vadd.i32	d5,d5,d25 +	eor	r3,r0,r5,ror#19 +	eor	r0,r9,r9,ror#11 +	vadd.i32	q8,q8,q2 +	eor	r2,r2,r7 +	add	r8,r8,r3,ror#6 +	eor	r3,r9,r10 +	eor	r0,r0,r9,ror#20 +	add	r8,r8,r2 +	ldr	r2,[sp,#48] +	and	r12,r12,r3 +	add	r4,r4,r8 +	vst1.32	{q8},[r1,:128]! +	add	r8,r8,r0,ror#2 +	eor	r12,r12,r10 +	vext.8	q8,q3,q0,#4 +	add	r7,r7,r2 +	eor	r2,r5,r6 +	eor	r0,r4,r4,ror#5 +	vext.8	q9,q1,q2,#4 +	add	r8,r8,r12 +	and	r2,r2,r4 +	eor	r12,r0,r4,ror#19 +	vshr.u32	q10,q8,#7 +	eor	r0,r8,r8,ror#11 +	eor	r2,r2,r6 +	vadd.i32	q3,q3,q9 +	add	r7,r7,r12,ror#6 +	eor	r12,r8,r9 +	vshr.u32	q9,q8,#3 +	eor	r0,r0,r8,ror#20 +	add	r7,r7,r2 +	vsli.32	q10,q8,#25 +	ldr	r2,[sp,#52] +	and	r3,r3,r12 +	vshr.u32	q11,q8,#18 +	add	r11,r11,r7 +	add	r7,r7,r0,ror#2 +	eor	r3,r3,r9 +	veor	q9,q9,q10 +	add	r6,r6,r2 +	vsli.32	q11,q8,#14 +	eor	r2,r4,r5 +	eor	r0,r11,r11,ror#5 +	vshr.u32	d24,d5,#17 +	add	r7,r7,r3 +	and	r2,r2,r11 +	veor	q9,q9,q11 +	eor	r3,r0,r11,ror#19 +	eor	r0,r7,r7,ror#11 +	vsli.32	d24,d5,#15 +	eor	r2,r2,r5 +	add	r6,r6,r3,ror#6 +	vshr.u32	d25,d5,#10 +	eor	r3,r7,r8 +	eor	r0,r0,r7,ror#20 +	vadd.i32	q3,q3,q9 +	add	r6,r6,r2 +	ldr	r2,[sp,#56] +	veor	d25,d25,d24 +	and	r12,r12,r3 +	add	r10,r10,r6 +	vshr.u32	d24,d5,#19 +	add	r6,r6,r0,ror#2 +	eor	r12,r12,r8 +	vsli.32	d24,d5,#13 +	add	r5,r5,r2 +	eor	r2,r11,r4 +	veor	d25,d25,d24 +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12 +	vadd.i32	d6,d6,d25 +	and	r2,r2,r10 +	eor	r12,r0,r10,ror#19 +	vshr.u32	d24,d6,#17 +	eor	r0,r6,r6,ror#11 +	eor	r2,r2,r4 +	vsli.32	d24,d6,#15 +	add	r5,r5,r12,ror#6 +	eor	r12,r6,r7 +	vshr.u32	d25,d6,#10 +	eor	r0,r0,r6,ror#20 +	add	r5,r5,r2 +	veor	d25,d25,d24 +	ldr	r2,[sp,#60] +	and	r3,r3,r12 +	vshr.u32	d24,d6,#19 +	add	r9,r9,r5 +	add	r5,r5,r0,ror#2 +	eor	r3,r3,r7 +	vld1.32	{q8},[r14,:128]! +	add	r4,r4,r2 +	vsli.32	d24,d6,#13 +	eor	r2,r10,r11 +	eor	r0,r9,r9,ror#5 +	veor	d25,d25,d24 +	add	r5,r5,r3 +	and	r2,r2,r9 +	vadd.i32	d7,d7,d25 +	eor	r3,r0,r9,ror#19 +	eor	r0,r5,r5,ror#11 +	vadd.i32	q8,q8,q3 +	eor	r2,r2,r11 +	add	r4,r4,r3,ror#6 +	eor	r3,r5,r6 +	eor	r0,r0,r5,ror#20 +	add	r4,r4,r2 +	ldr	r2,[r14] +	and	r12,r12,r3 +	add	r8,r8,r4 +	vst1.32	{q8},[r1,:128]! +	add	r4,r4,r0,ror#2 +	eor	r12,r12,r6 +	teq	r2,#0				@ check for K256 terminator +	ldr	r2,[sp,#0] +	sub	r1,r1,#64 +	bne	.L_00_48 + +	ldr		r1,[sp,#68] +	ldr		r0,[sp,#72] +	sub		r14,r14,#256	@ rewind r14 +	teq		r1,r0 +	subeq		r1,r1,#64		@ avoid SEGV +	vld1.8		{q0},[r1]!		@ load next input block +	vld1.8		{q1},[r1]! +	vld1.8		{q2},[r1]! +	vld1.8		{q3},[r1]! +	strne		r1,[sp,#68] +	mov		r1,sp +	add	r11,r11,r2 +	eor	r2,r9,r10 +	eor	r0,r8,r8,ror#5 +	add	r4,r4,r12 +	vld1.32	{q8},[r14,:128]! 
+	and	r2,r2,r8 +	eor	r12,r0,r8,ror#19 +	eor	r0,r4,r4,ror#11 +	eor	r2,r2,r10 +	vrev32.8	q0,q0 +	add	r11,r11,r12,ror#6 +	eor	r12,r4,r5 +	eor	r0,r0,r4,ror#20 +	add	r11,r11,r2 +	vadd.i32	q8,q8,q0 +	ldr	r2,[sp,#4] +	and	r3,r3,r12 +	add	r7,r7,r11 +	add	r11,r11,r0,ror#2 +	eor	r3,r3,r5 +	add	r10,r10,r2 +	eor	r2,r8,r9 +	eor	r0,r7,r7,ror#5 +	add	r11,r11,r3 +	and	r2,r2,r7 +	eor	r3,r0,r7,ror#19 +	eor	r0,r11,r11,ror#11 +	eor	r2,r2,r9 +	add	r10,r10,r3,ror#6 +	eor	r3,r11,r4 +	eor	r0,r0,r11,ror#20 +	add	r10,r10,r2 +	ldr	r2,[sp,#8] +	and	r12,r12,r3 +	add	r6,r6,r10 +	add	r10,r10,r0,ror#2 +	eor	r12,r12,r4 +	add	r9,r9,r2 +	eor	r2,r7,r8 +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12 +	and	r2,r2,r6 +	eor	r12,r0,r6,ror#19 +	eor	r0,r10,r10,ror#11 +	eor	r2,r2,r8 +	add	r9,r9,r12,ror#6 +	eor	r12,r10,r11 +	eor	r0,r0,r10,ror#20 +	add	r9,r9,r2 +	ldr	r2,[sp,#12] +	and	r3,r3,r12 +	add	r5,r5,r9 +	add	r9,r9,r0,ror#2 +	eor	r3,r3,r11 +	add	r8,r8,r2 +	eor	r2,r6,r7 +	eor	r0,r5,r5,ror#5 +	add	r9,r9,r3 +	and	r2,r2,r5 +	eor	r3,r0,r5,ror#19 +	eor	r0,r9,r9,ror#11 +	eor	r2,r2,r7 +	add	r8,r8,r3,ror#6 +	eor	r3,r9,r10 +	eor	r0,r0,r9,ror#20 +	add	r8,r8,r2 +	ldr	r2,[sp,#16] +	and	r12,r12,r3 +	add	r4,r4,r8 +	add	r8,r8,r0,ror#2 +	eor	r12,r12,r10 +	vst1.32	{q8},[r1,:128]! +	add	r7,r7,r2 +	eor	r2,r5,r6 +	eor	r0,r4,r4,ror#5 +	add	r8,r8,r12 +	vld1.32	{q8},[r14,:128]! +	and	r2,r2,r4 +	eor	r12,r0,r4,ror#19 +	eor	r0,r8,r8,ror#11 +	eor	r2,r2,r6 +	vrev32.8	q1,q1 +	add	r7,r7,r12,ror#6 +	eor	r12,r8,r9 +	eor	r0,r0,r8,ror#20 +	add	r7,r7,r2 +	vadd.i32	q8,q8,q1 +	ldr	r2,[sp,#20] +	and	r3,r3,r12 +	add	r11,r11,r7 +	add	r7,r7,r0,ror#2 +	eor	r3,r3,r9 +	add	r6,r6,r2 +	eor	r2,r4,r5 +	eor	r0,r11,r11,ror#5 +	add	r7,r7,r3 +	and	r2,r2,r11 +	eor	r3,r0,r11,ror#19 +	eor	r0,r7,r7,ror#11 +	eor	r2,r2,r5 +	add	r6,r6,r3,ror#6 +	eor	r3,r7,r8 +	eor	r0,r0,r7,ror#20 +	add	r6,r6,r2 +	ldr	r2,[sp,#24] +	and	r12,r12,r3 +	add	r10,r10,r6 +	add	r6,r6,r0,ror#2 +	eor	r12,r12,r8 +	add	r5,r5,r2 +	eor	r2,r11,r4 +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12 +	and	r2,r2,r10 +	eor	r12,r0,r10,ror#19 +	eor	r0,r6,r6,ror#11 +	eor	r2,r2,r4 +	add	r5,r5,r12,ror#6 +	eor	r12,r6,r7 +	eor	r0,r0,r6,ror#20 +	add	r5,r5,r2 +	ldr	r2,[sp,#28] +	and	r3,r3,r12 +	add	r9,r9,r5 +	add	r5,r5,r0,ror#2 +	eor	r3,r3,r7 +	add	r4,r4,r2 +	eor	r2,r10,r11 +	eor	r0,r9,r9,ror#5 +	add	r5,r5,r3 +	and	r2,r2,r9 +	eor	r3,r0,r9,ror#19 +	eor	r0,r5,r5,ror#11 +	eor	r2,r2,r11 +	add	r4,r4,r3,ror#6 +	eor	r3,r5,r6 +	eor	r0,r0,r5,ror#20 +	add	r4,r4,r2 +	ldr	r2,[sp,#32] +	and	r12,r12,r3 +	add	r8,r8,r4 +	add	r4,r4,r0,ror#2 +	eor	r12,r12,r6 +	vst1.32	{q8},[r1,:128]! +	add	r11,r11,r2 +	eor	r2,r9,r10 +	eor	r0,r8,r8,ror#5 +	add	r4,r4,r12 +	vld1.32	{q8},[r14,:128]! 
+	and	r2,r2,r8 +	eor	r12,r0,r8,ror#19 +	eor	r0,r4,r4,ror#11 +	eor	r2,r2,r10 +	vrev32.8	q2,q2 +	add	r11,r11,r12,ror#6 +	eor	r12,r4,r5 +	eor	r0,r0,r4,ror#20 +	add	r11,r11,r2 +	vadd.i32	q8,q8,q2 +	ldr	r2,[sp,#36] +	and	r3,r3,r12 +	add	r7,r7,r11 +	add	r11,r11,r0,ror#2 +	eor	r3,r3,r5 +	add	r10,r10,r2 +	eor	r2,r8,r9 +	eor	r0,r7,r7,ror#5 +	add	r11,r11,r3 +	and	r2,r2,r7 +	eor	r3,r0,r7,ror#19 +	eor	r0,r11,r11,ror#11 +	eor	r2,r2,r9 +	add	r10,r10,r3,ror#6 +	eor	r3,r11,r4 +	eor	r0,r0,r11,ror#20 +	add	r10,r10,r2 +	ldr	r2,[sp,#40] +	and	r12,r12,r3 +	add	r6,r6,r10 +	add	r10,r10,r0,ror#2 +	eor	r12,r12,r4 +	add	r9,r9,r2 +	eor	r2,r7,r8 +	eor	r0,r6,r6,ror#5 +	add	r10,r10,r12 +	and	r2,r2,r6 +	eor	r12,r0,r6,ror#19 +	eor	r0,r10,r10,ror#11 +	eor	r2,r2,r8 +	add	r9,r9,r12,ror#6 +	eor	r12,r10,r11 +	eor	r0,r0,r10,ror#20 +	add	r9,r9,r2 +	ldr	r2,[sp,#44] +	and	r3,r3,r12 +	add	r5,r5,r9 +	add	r9,r9,r0,ror#2 +	eor	r3,r3,r11 +	add	r8,r8,r2 +	eor	r2,r6,r7 +	eor	r0,r5,r5,ror#5 +	add	r9,r9,r3 +	and	r2,r2,r5 +	eor	r3,r0,r5,ror#19 +	eor	r0,r9,r9,ror#11 +	eor	r2,r2,r7 +	add	r8,r8,r3,ror#6 +	eor	r3,r9,r10 +	eor	r0,r0,r9,ror#20 +	add	r8,r8,r2 +	ldr	r2,[sp,#48] +	and	r12,r12,r3 +	add	r4,r4,r8 +	add	r8,r8,r0,ror#2 +	eor	r12,r12,r10 +	vst1.32	{q8},[r1,:128]! +	add	r7,r7,r2 +	eor	r2,r5,r6 +	eor	r0,r4,r4,ror#5 +	add	r8,r8,r12 +	vld1.32	{q8},[r14,:128]! +	and	r2,r2,r4 +	eor	r12,r0,r4,ror#19 +	eor	r0,r8,r8,ror#11 +	eor	r2,r2,r6 +	vrev32.8	q3,q3 +	add	r7,r7,r12,ror#6 +	eor	r12,r8,r9 +	eor	r0,r0,r8,ror#20 +	add	r7,r7,r2 +	vadd.i32	q8,q8,q3 +	ldr	r2,[sp,#52] +	and	r3,r3,r12 +	add	r11,r11,r7 +	add	r7,r7,r0,ror#2 +	eor	r3,r3,r9 +	add	r6,r6,r2 +	eor	r2,r4,r5 +	eor	r0,r11,r11,ror#5 +	add	r7,r7,r3 +	and	r2,r2,r11 +	eor	r3,r0,r11,ror#19 +	eor	r0,r7,r7,ror#11 +	eor	r2,r2,r5 +	add	r6,r6,r3,ror#6 +	eor	r3,r7,r8 +	eor	r0,r0,r7,ror#20 +	add	r6,r6,r2 +	ldr	r2,[sp,#56] +	and	r12,r12,r3 +	add	r10,r10,r6 +	add	r6,r6,r0,ror#2 +	eor	r12,r12,r8 +	add	r5,r5,r2 +	eor	r2,r11,r4 +	eor	r0,r10,r10,ror#5 +	add	r6,r6,r12 +	and	r2,r2,r10 +	eor	r12,r0,r10,ror#19 +	eor	r0,r6,r6,ror#11 +	eor	r2,r2,r4 +	add	r5,r5,r12,ror#6 +	eor	r12,r6,r7 +	eor	r0,r0,r6,ror#20 +	add	r5,r5,r2 +	ldr	r2,[sp,#60] +	and	r3,r3,r12 +	add	r9,r9,r5 +	add	r5,r5,r0,ror#2 +	eor	r3,r3,r7 +	add	r4,r4,r2 +	eor	r2,r10,r11 +	eor	r0,r9,r9,ror#5 +	add	r5,r5,r3 +	and	r2,r2,r9 +	eor	r3,r0,r9,ror#19 +	eor	r0,r5,r5,ror#11 +	eor	r2,r2,r11 +	add	r4,r4,r3,ror#6 +	eor	r3,r5,r6 +	eor	r0,r0,r5,ror#20 +	add	r4,r4,r2 +	ldr	r2,[sp,#64] +	and	r12,r12,r3 +	add	r8,r8,r4 +	add	r4,r4,r0,ror#2 +	eor	r12,r12,r6 +	vst1.32	{q8},[r1,:128]! +	ldr	r0,[r2,#0] +	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past +	ldr	r12,[r2,#4] +	ldr	r3,[r2,#8] +	ldr	r1,[r2,#12] +	add	r4,r4,r0			@ accumulate +	ldr	r0,[r2,#16] +	add	r5,r5,r12 +	ldr	r12,[r2,#20] +	add	r6,r6,r3 +	ldr	r3,[r2,#24] +	add	r7,r7,r1 +	ldr	r1,[r2,#28] +	add	r8,r8,r0 +	str	r4,[r2],#4 +	add	r9,r9,r12 +	str	r5,[r2],#4 +	add	r10,r10,r3 +	str	r6,[r2],#4 +	add	r11,r11,r1 +	str	r7,[r2],#4 +	stmia	r2,{r8-r11} + +	movne	r1,sp +	ldrne	r2,[sp,#0] +	eorne	r12,r12,r12 +	ldreq	sp,[sp,#76]			@ restore original sp +	eorne	r3,r5,r6 +	bne	.L_00_48 + +	ldmia	sp!,{r4-r12,pc} +.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_ARCH__>=7 +.type	sha256_block_data_order_armv8,%function +.align	5 +sha256_block_data_order_armv8: +.LARMv8: +	vld1.32	{q0,q1},[r0] +	sub	r3,r3,#sha256_block_data_order-K256 + +.Loop_v8: +	vld1.8		{q8-q9},[r1]! +	vld1.8		{q10-q11},[r1]! +	vld1.32		{q12},[r3]! 
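For reference while reading the interleaved scalar rounds above and the new AArch64 files below: the register comments Ch(e,f,g), Maj(a,b,c), Sigma1(e), Sigma0(a) and "h+=Maj(a,b,c) from the past" track the standard FIPS 180-4 SHA-256 round, with the eight state words held in r4-r11 here and in w20-w27 in sha256-armv8.S; in the NEON path the q-register instructions build the message schedule while the interleaved ARM instructions carry the rounds. A minimal C sketch of one such round, illustrative only and not part of the patch; the names and rotate constants come from the spec, not from identifiers in these files:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    /* Illustrative sketch of one SHA-256 compression round (FIPS 180-4 notation);
     * the assembly folds K[i] and W[i] into h first, then adds Ch, Sigma1, Maj, Sigma0. */
    static void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Wi)
    {
        uint32_t a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
        uint32_t S1  = ror32(e,6) ^ ror32(e,11) ^ ror32(e,25);  /* Sigma1(e) */
        uint32_t ch  = (e & f) ^ (~e & g);                      /* Ch(e,f,g) */
        uint32_t t1  = h + S1 + ch + Ki + Wi;
        uint32_t S0  = ror32(a,2) ^ ror32(a,13) ^ ror32(a,22);  /* Sigma0(a) */
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);             /* Maj(a,b,c) */
        s[7]=g; s[6]=f; s[5]=e; s[4]=d+t1; s[3]=c; s[2]=b; s[1]=a; s[0]=t1+S0+maj;
    }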
+	vrev32.8	q8,q8 +	vrev32.8	q9,q9 +	vrev32.8	q10,q10 +	vrev32.8	q11,q11 +	vmov		q14,q0	@ offload +	vmov		q15,q1 +	teq		r1,r2 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q8 +	.byte	0xe2,0x03,0xfa,0xf3	@ sha256su0 q8,q9 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe6,0x0c,0x64,0xf3	@ sha256su1 q8,q10,q11 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q9 +	.byte	0xe4,0x23,0xfa,0xf3	@ sha256su0 q9,q10 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe0,0x2c,0x66,0xf3	@ sha256su1 q9,q11,q8 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q10 +	.byte	0xe6,0x43,0xfa,0xf3	@ sha256su0 q10,q11 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe2,0x4c,0x60,0xf3	@ sha256su1 q10,q8,q9 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q11 +	.byte	0xe0,0x63,0xfa,0xf3	@ sha256su0 q11,q8 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe4,0x6c,0x62,0xf3	@ sha256su1 q11,q9,q10 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q8 +	.byte	0xe2,0x03,0xfa,0xf3	@ sha256su0 q8,q9 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe6,0x0c,0x64,0xf3	@ sha256su1 q8,q10,q11 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q9 +	.byte	0xe4,0x23,0xfa,0xf3	@ sha256su0 q9,q10 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe0,0x2c,0x66,0xf3	@ sha256su1 q9,q11,q8 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q10 +	.byte	0xe6,0x43,0xfa,0xf3	@ sha256su0 q10,q11 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe2,0x4c,0x60,0xf3	@ sha256su1 q10,q8,q9 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q11 +	.byte	0xe0,0x63,0xfa,0xf3	@ sha256su0 q11,q8 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe4,0x6c,0x62,0xf3	@ sha256su1 q11,q9,q10 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q8 +	.byte	0xe2,0x03,0xfa,0xf3	@ sha256su0 q8,q9 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe6,0x0c,0x64,0xf3	@ sha256su1 q8,q10,q11 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q9 +	.byte	0xe4,0x23,0xfa,0xf3	@ sha256su0 q9,q10 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe0,0x2c,0x66,0xf3	@ sha256su1 q9,q11,q8 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q10 +	.byte	0xe6,0x43,0xfa,0xf3	@ sha256su0 q10,q11 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 +	.byte	0xe2,0x4c,0x60,0xf3	@ sha256su1 q10,q8,q9 +	vld1.32		{q12},[r3]! +	vadd.i32	q13,q13,q11 +	.byte	0xe0,0x63,0xfa,0xf3	@ sha256su0 q11,q8 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 +	.byte	0xe4,0x6c,0x62,0xf3	@ sha256su1 q11,q9,q10 +	vld1.32		{q13},[r3]! +	vadd.i32	q12,q12,q8 +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 + +	vld1.32		{q12},[r3]! 
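The .byte lines in this routine are the ARMv8 SHA-256 instructions named in the trailing comments (sha256h, sha256h2, sha256su0, sha256su1), emitted as raw encodings, presumably so the file still assembles with toolchains that lack those mnemonics; the AArch64 file below does the same with .inst. The sha256su0/sha256su1 pairs advance the message schedule four words at a time. A scalar C sketch of the per-word update they implement, illustrative only and using the FIPS 180-4 sigma functions rather than anything defined in this patch:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    /* Illustrative sketch: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], t >= 16 */
    static uint32_t sha256_schedule(const uint32_t W[], int t)
    {
        uint32_t s0 = ror32(W[t-15],7)  ^ ror32(W[t-15],18) ^ (W[t-15] >> 3);  /* sigma0 */
        uint32_t s1 = ror32(W[t-2],17)  ^ ror32(W[t-2],19)  ^ (W[t-2] >> 10);  /* sigma1 */
        return s1 + W[t-7] + s0 + W[t-16];
    }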
+	vadd.i32	q13,q13,q9 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 + +	vld1.32		{q13},[r3] +	vadd.i32	q12,q12,q10 +	sub		r3,r3,#256-16	@ rewind +	vmov		q2,q0 +	.byte	0x68,0x0c,0x02,0xf3	@ sha256h q0,q1,q12 +	.byte	0x68,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q12 + +	vadd.i32	q13,q13,q11 +	vmov		q2,q0 +	.byte	0x6a,0x0c,0x02,0xf3	@ sha256h q0,q1,q13 +	.byte	0x6a,0x2c,0x14,0xf3	@ sha256h2 q1,q2,q13 + +	vadd.i32	q0,q0,q14 +	vadd.i32	q1,q1,q15 +	bne		.Loop_v8 + +	vst1.32		{q0,q1},[r0] + +	bx	lr		@ bx lr +.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"  .align	2 +.comm   OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha256-armv8.S b/main/openssl/crypto/sha/asm/sha256-armv8.S new file mode 100644 index 00000000..bd43b1fe --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha256-armv8.S @@ -0,0 +1,1141 @@ +#include "arm_arch.h" + +.text + +.globl	sha256_block_data_order +.type	sha256_block_data_order,%function +.align	6 +sha256_block_data_order: +	ldr	x16,.LOPENSSL_armcap_P +	adr	x17,.LOPENSSL_armcap_P +	add	x16,x16,x17 +	ldr	w16,[x16] +	tst	w16,#ARMV8_SHA256 +	b.ne	.Lv8_entry +	stp	x29,x30,[sp,#-128]! +	add	x29,sp,#0 + +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] +	sub	sp,sp,#4*4 + +	ldp	w20,w21,[x0]				// load context +	ldp	w22,w23,[x0,#2*4] +	ldp	w24,w25,[x0,#4*4] +	add	x2,x1,x2,lsl#6	// end of input +	ldp	w26,w27,[x0,#6*4] +	adr	x30,K256 +	stp	x0,x2,[x29,#96] + +.Loop: +	ldp	w3,w4,[x1],#2*4 +	ldr	w19,[x30],#4			// *K++ +	eor	w28,w21,w22				// magic seed +	str	x1,[x29,#112] +#ifndef	__ARMEB__ +	rev	w3,w3			// 0 +#endif +	ror	w16,w24,#6 +	add	w27,w27,w19			// h+=K[i] +	eor	w6,w24,w24,ror#14 +	and	w17,w25,w24 +	bic	w19,w26,w24 +	add	w27,w27,w3			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w20,w21			// a^b, b^c in next round +	eor	w16,w16,w6,ror#11	// Sigma1(e) +	ror	w6,w20,#2 +	add	w27,w27,w17			// h+=Ch(e,f,g) +	eor	w17,w20,w20,ror#9 +	add	w27,w27,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w23,w23,w27			// d+=h +	eor	w28,w28,w21			// Maj(a,b,c) +	eor	w17,w6,w17,ror#13	// Sigma0(a) +	add	w27,w27,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w27,w27,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w4,w4			// 1 +#endif +	ldp	w5,w6,[x1],#2*4 +	add	w27,w27,w17			// h+=Sigma0(a) +	ror	w16,w23,#6 +	add	w26,w26,w28			// h+=K[i] +	eor	w7,w23,w23,ror#14 +	and	w17,w24,w23 +	bic	w28,w25,w23 +	add	w26,w26,w4			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w27,w20			// a^b, b^c in next round +	eor	w16,w16,w7,ror#11	// Sigma1(e) +	ror	w7,w27,#2 +	add	w26,w26,w17			// h+=Ch(e,f,g) +	eor	w17,w27,w27,ror#9 +	add	w26,w26,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w22,w22,w26			// d+=h +	eor	w19,w19,w20			// Maj(a,b,c) +	eor	w17,w7,w17,ror#13	// Sigma0(a) +	add	w26,w26,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w26,w26,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w5,w5			// 2 +#endif +	add	w26,w26,w17			// h+=Sigma0(a) +	ror	w16,w22,#6 +	add	w25,w25,w19			// h+=K[i] +	eor	w8,w22,w22,ror#14 +	and	w17,w23,w22 +	bic	w19,w24,w22 +	add	w25,w25,w5			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w26,w27			// a^b, b^c in next round +	eor	w16,w16,w8,ror#11	// Sigma1(e) +	ror	w8,w26,#2 +	add	w25,w25,w17			// h+=Ch(e,f,g) +	eor	
w17,w26,w26,ror#9 +	add	w25,w25,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w21,w21,w25			// d+=h +	eor	w28,w28,w27			// Maj(a,b,c) +	eor	w17,w8,w17,ror#13	// Sigma0(a) +	add	w25,w25,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w25,w25,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w6,w6			// 3 +#endif +	ldp	w7,w8,[x1],#2*4 +	add	w25,w25,w17			// h+=Sigma0(a) +	ror	w16,w21,#6 +	add	w24,w24,w28			// h+=K[i] +	eor	w9,w21,w21,ror#14 +	and	w17,w22,w21 +	bic	w28,w23,w21 +	add	w24,w24,w6			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w25,w26			// a^b, b^c in next round +	eor	w16,w16,w9,ror#11	// Sigma1(e) +	ror	w9,w25,#2 +	add	w24,w24,w17			// h+=Ch(e,f,g) +	eor	w17,w25,w25,ror#9 +	add	w24,w24,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w20,w20,w24			// d+=h +	eor	w19,w19,w26			// Maj(a,b,c) +	eor	w17,w9,w17,ror#13	// Sigma0(a) +	add	w24,w24,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w24,w24,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w7,w7			// 4 +#endif +	add	w24,w24,w17			// h+=Sigma0(a) +	ror	w16,w20,#6 +	add	w23,w23,w19			// h+=K[i] +	eor	w10,w20,w20,ror#14 +	and	w17,w21,w20 +	bic	w19,w22,w20 +	add	w23,w23,w7			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w24,w25			// a^b, b^c in next round +	eor	w16,w16,w10,ror#11	// Sigma1(e) +	ror	w10,w24,#2 +	add	w23,w23,w17			// h+=Ch(e,f,g) +	eor	w17,w24,w24,ror#9 +	add	w23,w23,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w27,w27,w23			// d+=h +	eor	w28,w28,w25			// Maj(a,b,c) +	eor	w17,w10,w17,ror#13	// Sigma0(a) +	add	w23,w23,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w23,w23,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w8,w8			// 5 +#endif +	ldp	w9,w10,[x1],#2*4 +	add	w23,w23,w17			// h+=Sigma0(a) +	ror	w16,w27,#6 +	add	w22,w22,w28			// h+=K[i] +	eor	w11,w27,w27,ror#14 +	and	w17,w20,w27 +	bic	w28,w21,w27 +	add	w22,w22,w8			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w23,w24			// a^b, b^c in next round +	eor	w16,w16,w11,ror#11	// Sigma1(e) +	ror	w11,w23,#2 +	add	w22,w22,w17			// h+=Ch(e,f,g) +	eor	w17,w23,w23,ror#9 +	add	w22,w22,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w26,w26,w22			// d+=h +	eor	w19,w19,w24			// Maj(a,b,c) +	eor	w17,w11,w17,ror#13	// Sigma0(a) +	add	w22,w22,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w22,w22,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w9,w9			// 6 +#endif +	add	w22,w22,w17			// h+=Sigma0(a) +	ror	w16,w26,#6 +	add	w21,w21,w19			// h+=K[i] +	eor	w12,w26,w26,ror#14 +	and	w17,w27,w26 +	bic	w19,w20,w26 +	add	w21,w21,w9			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w22,w23			// a^b, b^c in next round +	eor	w16,w16,w12,ror#11	// Sigma1(e) +	ror	w12,w22,#2 +	add	w21,w21,w17			// h+=Ch(e,f,g) +	eor	w17,w22,w22,ror#9 +	add	w21,w21,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w25,w25,w21			// d+=h +	eor	w28,w28,w23			// Maj(a,b,c) +	eor	w17,w12,w17,ror#13	// Sigma0(a) +	add	w21,w21,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w21,w21,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w10,w10			// 7 +#endif +	ldp	w11,w12,[x1],#2*4 +	add	w21,w21,w17			// h+=Sigma0(a) +	ror	w16,w25,#6 +	add	w20,w20,w28			// h+=K[i] +	eor	w13,w25,w25,ror#14 +	and	w17,w26,w25 +	bic	w28,w27,w25 +	add	w20,w20,w10			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w21,w22			// a^b, b^c in next round +	eor	w16,w16,w13,ror#11	// Sigma1(e) +	
ror	w13,w21,#2 +	add	w20,w20,w17			// h+=Ch(e,f,g) +	eor	w17,w21,w21,ror#9 +	add	w20,w20,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w24,w24,w20			// d+=h +	eor	w19,w19,w22			// Maj(a,b,c) +	eor	w17,w13,w17,ror#13	// Sigma0(a) +	add	w20,w20,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w20,w20,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w11,w11			// 8 +#endif +	add	w20,w20,w17			// h+=Sigma0(a) +	ror	w16,w24,#6 +	add	w27,w27,w19			// h+=K[i] +	eor	w14,w24,w24,ror#14 +	and	w17,w25,w24 +	bic	w19,w26,w24 +	add	w27,w27,w11			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w20,w21			// a^b, b^c in next round +	eor	w16,w16,w14,ror#11	// Sigma1(e) +	ror	w14,w20,#2 +	add	w27,w27,w17			// h+=Ch(e,f,g) +	eor	w17,w20,w20,ror#9 +	add	w27,w27,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w23,w23,w27			// d+=h +	eor	w28,w28,w21			// Maj(a,b,c) +	eor	w17,w14,w17,ror#13	// Sigma0(a) +	add	w27,w27,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w27,w27,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w12,w12			// 9 +#endif +	ldp	w13,w14,[x1],#2*4 +	add	w27,w27,w17			// h+=Sigma0(a) +	ror	w16,w23,#6 +	add	w26,w26,w28			// h+=K[i] +	eor	w15,w23,w23,ror#14 +	and	w17,w24,w23 +	bic	w28,w25,w23 +	add	w26,w26,w12			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w27,w20			// a^b, b^c in next round +	eor	w16,w16,w15,ror#11	// Sigma1(e) +	ror	w15,w27,#2 +	add	w26,w26,w17			// h+=Ch(e,f,g) +	eor	w17,w27,w27,ror#9 +	add	w26,w26,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w22,w22,w26			// d+=h +	eor	w19,w19,w20			// Maj(a,b,c) +	eor	w17,w15,w17,ror#13	// Sigma0(a) +	add	w26,w26,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w26,w26,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w13,w13			// 10 +#endif +	add	w26,w26,w17			// h+=Sigma0(a) +	ror	w16,w22,#6 +	add	w25,w25,w19			// h+=K[i] +	eor	w0,w22,w22,ror#14 +	and	w17,w23,w22 +	bic	w19,w24,w22 +	add	w25,w25,w13			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w26,w27			// a^b, b^c in next round +	eor	w16,w16,w0,ror#11	// Sigma1(e) +	ror	w0,w26,#2 +	add	w25,w25,w17			// h+=Ch(e,f,g) +	eor	w17,w26,w26,ror#9 +	add	w25,w25,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w21,w21,w25			// d+=h +	eor	w28,w28,w27			// Maj(a,b,c) +	eor	w17,w0,w17,ror#13	// Sigma0(a) +	add	w25,w25,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w25,w25,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w14,w14			// 11 +#endif +	ldp	w15,w0,[x1],#2*4 +	add	w25,w25,w17			// h+=Sigma0(a) +	str	w6,[sp,#12] +	ror	w16,w21,#6 +	add	w24,w24,w28			// h+=K[i] +	eor	w6,w21,w21,ror#14 +	and	w17,w22,w21 +	bic	w28,w23,w21 +	add	w24,w24,w14			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w25,w26			// a^b, b^c in next round +	eor	w16,w16,w6,ror#11	// Sigma1(e) +	ror	w6,w25,#2 +	add	w24,w24,w17			// h+=Ch(e,f,g) +	eor	w17,w25,w25,ror#9 +	add	w24,w24,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w20,w20,w24			// d+=h +	eor	w19,w19,w26			// Maj(a,b,c) +	eor	w17,w6,w17,ror#13	// Sigma0(a) +	add	w24,w24,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w24,w24,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w15,w15			// 12 +#endif +	add	w24,w24,w17			// h+=Sigma0(a) +	str	w7,[sp,#0] +	ror	w16,w20,#6 +	add	w23,w23,w19			// h+=K[i] +	eor	w7,w20,w20,ror#14 +	and	w17,w21,w20 +	bic	w19,w22,w20 +	add	w23,w23,w15			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	
eor	w19,w24,w25			// a^b, b^c in next round +	eor	w16,w16,w7,ror#11	// Sigma1(e) +	ror	w7,w24,#2 +	add	w23,w23,w17			// h+=Ch(e,f,g) +	eor	w17,w24,w24,ror#9 +	add	w23,w23,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w27,w27,w23			// d+=h +	eor	w28,w28,w25			// Maj(a,b,c) +	eor	w17,w7,w17,ror#13	// Sigma0(a) +	add	w23,w23,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w23,w23,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w0,w0			// 13 +#endif +	ldp	w1,w2,[x1] +	add	w23,w23,w17			// h+=Sigma0(a) +	str	w8,[sp,#4] +	ror	w16,w27,#6 +	add	w22,w22,w28			// h+=K[i] +	eor	w8,w27,w27,ror#14 +	and	w17,w20,w27 +	bic	w28,w21,w27 +	add	w22,w22,w0			// h+=X[i] +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w23,w24			// a^b, b^c in next round +	eor	w16,w16,w8,ror#11	// Sigma1(e) +	ror	w8,w23,#2 +	add	w22,w22,w17			// h+=Ch(e,f,g) +	eor	w17,w23,w23,ror#9 +	add	w22,w22,w16			// h+=Sigma1(e) +	and	w19,w19,w28			// (b^c)&=(a^b) +	add	w26,w26,w22			// d+=h +	eor	w19,w19,w24			// Maj(a,b,c) +	eor	w17,w8,w17,ror#13	// Sigma0(a) +	add	w22,w22,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	//add	w22,w22,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w1,w1			// 14 +#endif +	ldr	w6,[sp,#12] +	add	w22,w22,w17			// h+=Sigma0(a) +	str	w9,[sp,#8] +	ror	w16,w26,#6 +	add	w21,w21,w19			// h+=K[i] +	eor	w9,w26,w26,ror#14 +	and	w17,w27,w26 +	bic	w19,w20,w26 +	add	w21,w21,w1			// h+=X[i] +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w22,w23			// a^b, b^c in next round +	eor	w16,w16,w9,ror#11	// Sigma1(e) +	ror	w9,w22,#2 +	add	w21,w21,w17			// h+=Ch(e,f,g) +	eor	w17,w22,w22,ror#9 +	add	w21,w21,w16			// h+=Sigma1(e) +	and	w28,w28,w19			// (b^c)&=(a^b) +	add	w25,w25,w21			// d+=h +	eor	w28,w28,w23			// Maj(a,b,c) +	eor	w17,w9,w17,ror#13	// Sigma0(a) +	add	w21,w21,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	//add	w21,w21,w17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	w2,w2			// 15 +#endif +	ldr	w7,[sp,#0] +	add	w21,w21,w17			// h+=Sigma0(a) +	str	w10,[sp,#12] +	ror	w16,w25,#6 +	add	w20,w20,w28			// h+=K[i] +	ror	w9,w4,#7 +	and	w17,w26,w25 +	ror	w8,w1,#17 +	bic	w28,w27,w25 +	ror	w10,w21,#2 +	add	w20,w20,w2			// h+=X[i] +	eor	w16,w16,w25,ror#11 +	eor	w9,w9,w4,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w21,w22			// a^b, b^c in next round +	eor	w16,w16,w25,ror#25	// Sigma1(e) +	eor	w10,w10,w21,ror#13 +	add	w20,w20,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w8,w8,w1,ror#19 +	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1]) +	add	w20,w20,w16			// h+=Sigma1(e) +	eor	w19,w19,w22			// Maj(a,b,c) +	eor	w17,w10,w21,ror#22	// Sigma0(a) +	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14]) +	add	w3,w3,w12 +	add	w24,w24,w20			// d+=h +	add	w20,w20,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w3,w3,w9 +	add	w20,w20,w17			// h+=Sigma0(a) +	add	w3,w3,w8 +.Loop_16_xx: +	ldr	w8,[sp,#4] +	str	w11,[sp,#0] +	ror	w16,w24,#6 +	add	w27,w27,w19			// h+=K[i] +	ror	w10,w5,#7 +	and	w17,w25,w24 +	ror	w9,w2,#17 +	bic	w19,w26,w24 +	ror	w11,w20,#2 +	add	w27,w27,w3			// h+=X[i] +	eor	w16,w16,w24,ror#11 +	eor	w10,w10,w5,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w20,w21			// a^b, b^c in next round +	eor	w16,w16,w24,ror#25	// Sigma1(e) +	eor	w11,w11,w20,ror#13 +	add	w27,w27,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w9,w9,w2,ror#19 +	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1]) +	add	w27,w27,w16			// h+=Sigma1(e) +	eor	w28,w28,w21			// Maj(a,b,c) +	eor	w17,w11,w20,ror#22	// Sigma0(a) +	eor	w9,w9,w2,lsr#10	// 
sigma1(X[i+14]) +	add	w4,w4,w13 +	add	w23,w23,w27			// d+=h +	add	w27,w27,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w4,w4,w10 +	add	w27,w27,w17			// h+=Sigma0(a) +	add	w4,w4,w9 +	ldr	w9,[sp,#8] +	str	w12,[sp,#4] +	ror	w16,w23,#6 +	add	w26,w26,w28			// h+=K[i] +	ror	w11,w6,#7 +	and	w17,w24,w23 +	ror	w10,w3,#17 +	bic	w28,w25,w23 +	ror	w12,w27,#2 +	add	w26,w26,w4			// h+=X[i] +	eor	w16,w16,w23,ror#11 +	eor	w11,w11,w6,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w27,w20			// a^b, b^c in next round +	eor	w16,w16,w23,ror#25	// Sigma1(e) +	eor	w12,w12,w27,ror#13 +	add	w26,w26,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w10,w10,w3,ror#19 +	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1]) +	add	w26,w26,w16			// h+=Sigma1(e) +	eor	w19,w19,w20			// Maj(a,b,c) +	eor	w17,w12,w27,ror#22	// Sigma0(a) +	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14]) +	add	w5,w5,w14 +	add	w22,w22,w26			// d+=h +	add	w26,w26,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w5,w5,w11 +	add	w26,w26,w17			// h+=Sigma0(a) +	add	w5,w5,w10 +	ldr	w10,[sp,#12] +	str	w13,[sp,#8] +	ror	w16,w22,#6 +	add	w25,w25,w19			// h+=K[i] +	ror	w12,w7,#7 +	and	w17,w23,w22 +	ror	w11,w4,#17 +	bic	w19,w24,w22 +	ror	w13,w26,#2 +	add	w25,w25,w5			// h+=X[i] +	eor	w16,w16,w22,ror#11 +	eor	w12,w12,w7,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w26,w27			// a^b, b^c in next round +	eor	w16,w16,w22,ror#25	// Sigma1(e) +	eor	w13,w13,w26,ror#13 +	add	w25,w25,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w11,w11,w4,ror#19 +	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1]) +	add	w25,w25,w16			// h+=Sigma1(e) +	eor	w28,w28,w27			// Maj(a,b,c) +	eor	w17,w13,w26,ror#22	// Sigma0(a) +	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14]) +	add	w6,w6,w15 +	add	w21,w21,w25			// d+=h +	add	w25,w25,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w6,w6,w12 +	add	w25,w25,w17			// h+=Sigma0(a) +	add	w6,w6,w11 +	ldr	w11,[sp,#0] +	str	w14,[sp,#12] +	ror	w16,w21,#6 +	add	w24,w24,w28			// h+=K[i] +	ror	w13,w8,#7 +	and	w17,w22,w21 +	ror	w12,w5,#17 +	bic	w28,w23,w21 +	ror	w14,w25,#2 +	add	w24,w24,w6			// h+=X[i] +	eor	w16,w16,w21,ror#11 +	eor	w13,w13,w8,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w25,w26			// a^b, b^c in next round +	eor	w16,w16,w21,ror#25	// Sigma1(e) +	eor	w14,w14,w25,ror#13 +	add	w24,w24,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w12,w12,w5,ror#19 +	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1]) +	add	w24,w24,w16			// h+=Sigma1(e) +	eor	w19,w19,w26			// Maj(a,b,c) +	eor	w17,w14,w25,ror#22	// Sigma0(a) +	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14]) +	add	w7,w7,w0 +	add	w20,w20,w24			// d+=h +	add	w24,w24,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w7,w7,w13 +	add	w24,w24,w17			// h+=Sigma0(a) +	add	w7,w7,w12 +	ldr	w12,[sp,#4] +	str	w15,[sp,#0] +	ror	w16,w20,#6 +	add	w23,w23,w19			// h+=K[i] +	ror	w14,w9,#7 +	and	w17,w21,w20 +	ror	w13,w6,#17 +	bic	w19,w22,w20 +	ror	w15,w24,#2 +	add	w23,w23,w7			// h+=X[i] +	eor	w16,w16,w20,ror#11 +	eor	w14,w14,w9,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w24,w25			// a^b, b^c in next round +	eor	w16,w16,w20,ror#25	// Sigma1(e) +	eor	w15,w15,w24,ror#13 +	add	w23,w23,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w13,w13,w6,ror#19 +	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1]) +	add	w23,w23,w16			// h+=Sigma1(e) +	eor	w28,w28,w25			// Maj(a,b,c) +	eor	w17,w15,w24,ror#22	// Sigma0(a) +	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14]) +	add	w8,w8,w1 +	add	w27,w27,w23			// 
d+=h +	add	w23,w23,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w8,w8,w14 +	add	w23,w23,w17			// h+=Sigma0(a) +	add	w8,w8,w13 +	ldr	w13,[sp,#8] +	str	w0,[sp,#4] +	ror	w16,w27,#6 +	add	w22,w22,w28			// h+=K[i] +	ror	w15,w10,#7 +	and	w17,w20,w27 +	ror	w14,w7,#17 +	bic	w28,w21,w27 +	ror	w0,w23,#2 +	add	w22,w22,w8			// h+=X[i] +	eor	w16,w16,w27,ror#11 +	eor	w15,w15,w10,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w23,w24			// a^b, b^c in next round +	eor	w16,w16,w27,ror#25	// Sigma1(e) +	eor	w0,w0,w23,ror#13 +	add	w22,w22,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w14,w14,w7,ror#19 +	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1]) +	add	w22,w22,w16			// h+=Sigma1(e) +	eor	w19,w19,w24			// Maj(a,b,c) +	eor	w17,w0,w23,ror#22	// Sigma0(a) +	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14]) +	add	w9,w9,w2 +	add	w26,w26,w22			// d+=h +	add	w22,w22,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w9,w9,w15 +	add	w22,w22,w17			// h+=Sigma0(a) +	add	w9,w9,w14 +	ldr	w14,[sp,#12] +	str	w1,[sp,#8] +	ror	w16,w26,#6 +	add	w21,w21,w19			// h+=K[i] +	ror	w0,w11,#7 +	and	w17,w27,w26 +	ror	w15,w8,#17 +	bic	w19,w20,w26 +	ror	w1,w22,#2 +	add	w21,w21,w9			// h+=X[i] +	eor	w16,w16,w26,ror#11 +	eor	w0,w0,w11,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w22,w23			// a^b, b^c in next round +	eor	w16,w16,w26,ror#25	// Sigma1(e) +	eor	w1,w1,w22,ror#13 +	add	w21,w21,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w15,w15,w8,ror#19 +	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1]) +	add	w21,w21,w16			// h+=Sigma1(e) +	eor	w28,w28,w23			// Maj(a,b,c) +	eor	w17,w1,w22,ror#22	// Sigma0(a) +	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14]) +	add	w10,w10,w3 +	add	w25,w25,w21			// d+=h +	add	w21,w21,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w10,w10,w0 +	add	w21,w21,w17			// h+=Sigma0(a) +	add	w10,w10,w15 +	ldr	w15,[sp,#0] +	str	w2,[sp,#12] +	ror	w16,w25,#6 +	add	w20,w20,w28			// h+=K[i] +	ror	w1,w12,#7 +	and	w17,w26,w25 +	ror	w0,w9,#17 +	bic	w28,w27,w25 +	ror	w2,w21,#2 +	add	w20,w20,w10			// h+=X[i] +	eor	w16,w16,w25,ror#11 +	eor	w1,w1,w12,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w21,w22			// a^b, b^c in next round +	eor	w16,w16,w25,ror#25	// Sigma1(e) +	eor	w2,w2,w21,ror#13 +	add	w20,w20,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w0,w0,w9,ror#19 +	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1]) +	add	w20,w20,w16			// h+=Sigma1(e) +	eor	w19,w19,w22			// Maj(a,b,c) +	eor	w17,w2,w21,ror#22	// Sigma0(a) +	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14]) +	add	w11,w11,w4 +	add	w24,w24,w20			// d+=h +	add	w20,w20,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w11,w11,w1 +	add	w20,w20,w17			// h+=Sigma0(a) +	add	w11,w11,w0 +	ldr	w0,[sp,#4] +	str	w3,[sp,#0] +	ror	w16,w24,#6 +	add	w27,w27,w19			// h+=K[i] +	ror	w2,w13,#7 +	and	w17,w25,w24 +	ror	w1,w10,#17 +	bic	w19,w26,w24 +	ror	w3,w20,#2 +	add	w27,w27,w11			// h+=X[i] +	eor	w16,w16,w24,ror#11 +	eor	w2,w2,w13,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w20,w21			// a^b, b^c in next round +	eor	w16,w16,w24,ror#25	// Sigma1(e) +	eor	w3,w3,w20,ror#13 +	add	w27,w27,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w1,w1,w10,ror#19 +	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1]) +	add	w27,w27,w16			// h+=Sigma1(e) +	eor	w28,w28,w21			// Maj(a,b,c) +	eor	w17,w3,w20,ror#22	// Sigma0(a) +	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14]) +	add	w12,w12,w5 +	add	w23,w23,w27			// d+=h +	add	w27,w27,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 
in next round +	add	w12,w12,w2 +	add	w27,w27,w17			// h+=Sigma0(a) +	add	w12,w12,w1 +	ldr	w1,[sp,#8] +	str	w4,[sp,#4] +	ror	w16,w23,#6 +	add	w26,w26,w28			// h+=K[i] +	ror	w3,w14,#7 +	and	w17,w24,w23 +	ror	w2,w11,#17 +	bic	w28,w25,w23 +	ror	w4,w27,#2 +	add	w26,w26,w12			// h+=X[i] +	eor	w16,w16,w23,ror#11 +	eor	w3,w3,w14,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w27,w20			// a^b, b^c in next round +	eor	w16,w16,w23,ror#25	// Sigma1(e) +	eor	w4,w4,w27,ror#13 +	add	w26,w26,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w2,w2,w11,ror#19 +	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1]) +	add	w26,w26,w16			// h+=Sigma1(e) +	eor	w19,w19,w20			// Maj(a,b,c) +	eor	w17,w4,w27,ror#22	// Sigma0(a) +	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14]) +	add	w13,w13,w6 +	add	w22,w22,w26			// d+=h +	add	w26,w26,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w13,w13,w3 +	add	w26,w26,w17			// h+=Sigma0(a) +	add	w13,w13,w2 +	ldr	w2,[sp,#12] +	str	w5,[sp,#8] +	ror	w16,w22,#6 +	add	w25,w25,w19			// h+=K[i] +	ror	w4,w15,#7 +	and	w17,w23,w22 +	ror	w3,w12,#17 +	bic	w19,w24,w22 +	ror	w5,w26,#2 +	add	w25,w25,w13			// h+=X[i] +	eor	w16,w16,w22,ror#11 +	eor	w4,w4,w15,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w26,w27			// a^b, b^c in next round +	eor	w16,w16,w22,ror#25	// Sigma1(e) +	eor	w5,w5,w26,ror#13 +	add	w25,w25,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w3,w3,w12,ror#19 +	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1]) +	add	w25,w25,w16			// h+=Sigma1(e) +	eor	w28,w28,w27			// Maj(a,b,c) +	eor	w17,w5,w26,ror#22	// Sigma0(a) +	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14]) +	add	w14,w14,w7 +	add	w21,w21,w25			// d+=h +	add	w25,w25,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w14,w14,w4 +	add	w25,w25,w17			// h+=Sigma0(a) +	add	w14,w14,w3 +	ldr	w3,[sp,#0] +	str	w6,[sp,#12] +	ror	w16,w21,#6 +	add	w24,w24,w28			// h+=K[i] +	ror	w5,w0,#7 +	and	w17,w22,w21 +	ror	w4,w13,#17 +	bic	w28,w23,w21 +	ror	w6,w25,#2 +	add	w24,w24,w14			// h+=X[i] +	eor	w16,w16,w21,ror#11 +	eor	w5,w5,w0,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w25,w26			// a^b, b^c in next round +	eor	w16,w16,w21,ror#25	// Sigma1(e) +	eor	w6,w6,w25,ror#13 +	add	w24,w24,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w4,w4,w13,ror#19 +	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1]) +	add	w24,w24,w16			// h+=Sigma1(e) +	eor	w19,w19,w26			// Maj(a,b,c) +	eor	w17,w6,w25,ror#22	// Sigma0(a) +	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14]) +	add	w15,w15,w8 +	add	w20,w20,w24			// d+=h +	add	w24,w24,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w15,w15,w5 +	add	w24,w24,w17			// h+=Sigma0(a) +	add	w15,w15,w4 +	ldr	w4,[sp,#4] +	str	w7,[sp,#0] +	ror	w16,w20,#6 +	add	w23,w23,w19			// h+=K[i] +	ror	w6,w1,#7 +	and	w17,w21,w20 +	ror	w5,w14,#17 +	bic	w19,w22,w20 +	ror	w7,w24,#2 +	add	w23,w23,w15			// h+=X[i] +	eor	w16,w16,w20,ror#11 +	eor	w6,w6,w1,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w24,w25			// a^b, b^c in next round +	eor	w16,w16,w20,ror#25	// Sigma1(e) +	eor	w7,w7,w24,ror#13 +	add	w23,w23,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w5,w5,w14,ror#19 +	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1]) +	add	w23,w23,w16			// h+=Sigma1(e) +	eor	w28,w28,w25			// Maj(a,b,c) +	eor	w17,w7,w24,ror#22	// Sigma0(a) +	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14]) +	add	w0,w0,w9 +	add	w27,w27,w23			// d+=h +	add	w23,w23,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w0,w0,w6 +	add	w23,w23,w17			// h+=Sigma0(a) +	add	w0,w0,w5 +	
ldr	w5,[sp,#8] +	str	w8,[sp,#4] +	ror	w16,w27,#6 +	add	w22,w22,w28			// h+=K[i] +	ror	w7,w2,#7 +	and	w17,w20,w27 +	ror	w6,w15,#17 +	bic	w28,w21,w27 +	ror	w8,w23,#2 +	add	w22,w22,w0			// h+=X[i] +	eor	w16,w16,w27,ror#11 +	eor	w7,w7,w2,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w23,w24			// a^b, b^c in next round +	eor	w16,w16,w27,ror#25	// Sigma1(e) +	eor	w8,w8,w23,ror#13 +	add	w22,w22,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w6,w6,w15,ror#19 +	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1]) +	add	w22,w22,w16			// h+=Sigma1(e) +	eor	w19,w19,w24			// Maj(a,b,c) +	eor	w17,w8,w23,ror#22	// Sigma0(a) +	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14]) +	add	w1,w1,w10 +	add	w26,w26,w22			// d+=h +	add	w22,w22,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w1,w1,w7 +	add	w22,w22,w17			// h+=Sigma0(a) +	add	w1,w1,w6 +	ldr	w6,[sp,#12] +	str	w9,[sp,#8] +	ror	w16,w26,#6 +	add	w21,w21,w19			// h+=K[i] +	ror	w8,w3,#7 +	and	w17,w27,w26 +	ror	w7,w0,#17 +	bic	w19,w20,w26 +	ror	w9,w22,#2 +	add	w21,w21,w1			// h+=X[i] +	eor	w16,w16,w26,ror#11 +	eor	w8,w8,w3,ror#18 +	orr	w17,w17,w19			// Ch(e,f,g) +	eor	w19,w22,w23			// a^b, b^c in next round +	eor	w16,w16,w26,ror#25	// Sigma1(e) +	eor	w9,w9,w22,ror#13 +	add	w21,w21,w17			// h+=Ch(e,f,g) +	and	w28,w28,w19			// (b^c)&=(a^b) +	eor	w7,w7,w0,ror#19 +	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1]) +	add	w21,w21,w16			// h+=Sigma1(e) +	eor	w28,w28,w23			// Maj(a,b,c) +	eor	w17,w9,w22,ror#22	// Sigma0(a) +	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14]) +	add	w2,w2,w11 +	add	w25,w25,w21			// d+=h +	add	w21,w21,w28			// h+=Maj(a,b,c) +	ldr	w28,[x30],#4		// *K++, w19 in next round +	add	w2,w2,w8 +	add	w21,w21,w17			// h+=Sigma0(a) +	add	w2,w2,w7 +	ldr	w7,[sp,#0] +	str	w10,[sp,#12] +	ror	w16,w25,#6 +	add	w20,w20,w28			// h+=K[i] +	ror	w9,w4,#7 +	and	w17,w26,w25 +	ror	w8,w1,#17 +	bic	w28,w27,w25 +	ror	w10,w21,#2 +	add	w20,w20,w2			// h+=X[i] +	eor	w16,w16,w25,ror#11 +	eor	w9,w9,w4,ror#18 +	orr	w17,w17,w28			// Ch(e,f,g) +	eor	w28,w21,w22			// a^b, b^c in next round +	eor	w16,w16,w25,ror#25	// Sigma1(e) +	eor	w10,w10,w21,ror#13 +	add	w20,w20,w17			// h+=Ch(e,f,g) +	and	w19,w19,w28			// (b^c)&=(a^b) +	eor	w8,w8,w1,ror#19 +	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1]) +	add	w20,w20,w16			// h+=Sigma1(e) +	eor	w19,w19,w22			// Maj(a,b,c) +	eor	w17,w10,w21,ror#22	// Sigma0(a) +	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14]) +	add	w3,w3,w12 +	add	w24,w24,w20			// d+=h +	add	w20,w20,w19			// h+=Maj(a,b,c) +	ldr	w19,[x30],#4		// *K++, w28 in next round +	add	w3,w3,w9 +	add	w20,w20,w17			// h+=Sigma0(a) +	add	w3,w3,w8 +	cbnz	w19,.Loop_16_xx + +	ldp	x0,x2,[x29,#96] +	ldr	x1,[x29,#112] +	sub	x30,x30,#260		// rewind + +	ldp	w3,w4,[x0] +	ldp	w5,w6,[x0,#2*4] +	add	x1,x1,#14*4			// advance input pointer +	ldp	w7,w8,[x0,#4*4] +	add	w20,w20,w3 +	ldp	w9,w10,[x0,#6*4] +	add	w21,w21,w4 +	add	w22,w22,w5 +	add	w23,w23,w6 +	stp	w20,w21,[x0] +	add	w24,w24,w7 +	add	w25,w25,w8 +	stp	w22,w23,[x0,#2*4] +	add	w26,w26,w9 +	add	w27,w27,w10 +	cmp	x1,x2 +	stp	w24,w25,[x0,#4*4] +	stp	w26,w27,[x0,#6*4] +	b.ne	.Loop + +	ldp	x19,x20,[x29,#16] +	add	sp,sp,#4*4 +	ldp	x21,x22,[x29,#32] +	ldp	x23,x24,[x29,#48] +	ldp	x25,x26,[x29,#64] +	ldp	x27,x28,[x29,#80] +	ldp	x29,x30,[sp],#128 +	ret +.size	sha256_block_data_order,.-sha256_block_data_order + +.align	6 +.type	K256,%object +K256: +	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +	.long	
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +	.long	0	//terminator +.size	K256,.-K256 +.align	3 +.LOPENSSL_armcap_P: +	.quad	OPENSSL_armcap_P-. +.asciz	"SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align	2 +.type	sha256_block_armv8,%function +.align	6 +sha256_block_armv8: +.Lv8_entry: +	stp		x29,x30,[sp,#-16]! +	add		x29,sp,#0 + +	ld1		{v0.4s,v1.4s},[x0] +	adr		x3,K256 + +.Loop_hw: +	ld1		{v4.16b-v7.16b},[x1],#64 +	sub		x2,x2,#1 +	ld1		{v16.4s},[x3],#16 +	rev32		v4.16b,v4.16b +	rev32		v5.16b,v5.16b +	rev32		v6.16b,v6.16b +	rev32		v7.16b,v7.16b +	orr		v18.16b,v0.16b,v0.16b		// offload +	orr		v19.16b,v1.16b,v1.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v4.4s +	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v5.4s +	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v6.4s +	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v7.4s +	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v4.4s +	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v5.4s +	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v6.4s +	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v7.4s +	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v4.4s +	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b +	orr		
v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v5.4s +	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v6.4s +	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s +	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v7.4s +	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s +	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b +	ld1		{v17.4s},[x3],#16 +	add		v16.4s,v16.4s,v4.4s +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s + +	ld1		{v16.4s},[x3],#16 +	add		v17.4s,v17.4s,v5.4s +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s + +	ld1		{v17.4s},[x3] +	add		v16.4s,v16.4s,v6.4s +	sub		x3,x3,#64*4-16	// rewind +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s +	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s + +	add		v17.4s,v17.4s,v7.4s +	orr		v2.16b,v0.16b,v0.16b +	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s +	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s + +	add		v0.4s,v0.4s,v18.4s +	add		v1.4s,v1.4s,v19.4s + +	cbnz		x2,.Loop_hw + +	st1		{v0.4s,v1.4s},[x0] + +	ldr		x29,[sp],#16 +	ret +.size	sha256_block_armv8,.-sha256_block_armv8 +.comm	OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.pl b/main/openssl/crypto/sha/asm/sha512-armv4.pl index 7faf37b1..71aa9356 100644 --- a/main/openssl/crypto/sha/asm/sha512-armv4.pl +++ b/main/openssl/crypto/sha/asm/sha512-armv4.pl @@ -565,7 +565,7 @@ $code.=<<___;  	bne		.Loop_neon  	vldmia	sp!,{d8-d15}		@ epilogue -	bx	lr +	ret				@ bx lr  #endif  ___  } @@ -578,5 +578,6 @@ ___  $code =~ s/\`([^\`]*)\`/eval $1/gem;  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx	lr/gm;  print $code;  close STDOUT; # enforce flush diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.s b/main/openssl/crypto/sha/asm/sha512-armv4.s index 57301922..fd462771 100644 --- a/main/openssl/crypto/sha/asm/sha512-armv4.s +++ b/main/openssl/crypto/sha/asm/sha512-armv4.s @@ -1775,7 +1775,7 @@ sha512_block_data_order:  	bne		.Loop_neon  	vldmia	sp!,{d8-d15}		@ epilogue -	.word	0xe12fff1e +	bx	lr				@ .word	0xe12fff1e  #endif  .size	sha512_block_data_order,.-sha512_block_data_order  .asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" diff --git a/main/openssl/crypto/sha/asm/sha512-armv8.S b/main/openssl/crypto/sha/asm/sha512-armv8.S new file mode 100644 index 00000000..6b0d1940 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha512-armv8.S @@ -0,0 +1,1021 @@ +#include "arm_arch.h" + +.text + +.globl	sha512_block_data_order +.type	sha512_block_data_order,%function +.align	6 +sha512_block_data_order: +	stp	x29,x30,[sp,#-128]! 
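The new sha512-armv8.S keeps the same scalar round structure as the SHA-256 file, widened to 64-bit x-registers; the rotate amounts paired with its Sigma1(e)/Sigma0(a)/sigma0/sigma1 comments (14/18/41, 28/34/39, 1/8/7, 19/61/6) are the SHA-512 constants. For reference, a matching C sketch of those helpers, illustrative only and not taken from the patch:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

    /* Illustrative sketch of the SHA-512 helper functions (FIPS 180-4); the AArch64
     * rounds below assemble the same values from pairs of ror/eor on x-registers. */
    static uint64_t Sigma0(uint64_t a) { return ror64(a,28) ^ ror64(a,34) ^ ror64(a,39); }
    static uint64_t Sigma1(uint64_t e) { return ror64(e,14) ^ ror64(e,18) ^ ror64(e,41); }
    static uint64_t sigma0(uint64_t x) { return ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7); }
    static uint64_t sigma1(uint64_t x) { return ror64(x,19) ^ ror64(x,61) ^ (x >> 6); }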
+	add	x29,sp,#0 + +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] +	sub	sp,sp,#4*8 + +	ldp	x20,x21,[x0]				// load context +	ldp	x22,x23,[x0,#2*8] +	ldp	x24,x25,[x0,#4*8] +	add	x2,x1,x2,lsl#7	// end of input +	ldp	x26,x27,[x0,#6*8] +	adr	x30,K512 +	stp	x0,x2,[x29,#96] + +.Loop: +	ldp	x3,x4,[x1],#2*8 +	ldr	x19,[x30],#8			// *K++ +	eor	x28,x21,x22				// magic seed +	str	x1,[x29,#112] +#ifndef	__ARMEB__ +	rev	x3,x3			// 0 +#endif +	ror	x16,x24,#14 +	add	x27,x27,x19			// h+=K[i] +	eor	x6,x24,x24,ror#23 +	and	x17,x25,x24 +	bic	x19,x26,x24 +	add	x27,x27,x3			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x20,x21			// a^b, b^c in next round +	eor	x16,x16,x6,ror#18	// Sigma1(e) +	ror	x6,x20,#28 +	add	x27,x27,x17			// h+=Ch(e,f,g) +	eor	x17,x20,x20,ror#5 +	add	x27,x27,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x23,x23,x27			// d+=h +	eor	x28,x28,x21			// Maj(a,b,c) +	eor	x17,x6,x17,ror#34	// Sigma0(a) +	add	x27,x27,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x27,x27,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x4,x4			// 1 +#endif +	ldp	x5,x6,[x1],#2*8 +	add	x27,x27,x17			// h+=Sigma0(a) +	ror	x16,x23,#14 +	add	x26,x26,x28			// h+=K[i] +	eor	x7,x23,x23,ror#23 +	and	x17,x24,x23 +	bic	x28,x25,x23 +	add	x26,x26,x4			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x27,x20			// a^b, b^c in next round +	eor	x16,x16,x7,ror#18	// Sigma1(e) +	ror	x7,x27,#28 +	add	x26,x26,x17			// h+=Ch(e,f,g) +	eor	x17,x27,x27,ror#5 +	add	x26,x26,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x22,x22,x26			// d+=h +	eor	x19,x19,x20			// Maj(a,b,c) +	eor	x17,x7,x17,ror#34	// Sigma0(a) +	add	x26,x26,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x26,x26,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x5,x5			// 2 +#endif +	add	x26,x26,x17			// h+=Sigma0(a) +	ror	x16,x22,#14 +	add	x25,x25,x19			// h+=K[i] +	eor	x8,x22,x22,ror#23 +	and	x17,x23,x22 +	bic	x19,x24,x22 +	add	x25,x25,x5			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x26,x27			// a^b, b^c in next round +	eor	x16,x16,x8,ror#18	// Sigma1(e) +	ror	x8,x26,#28 +	add	x25,x25,x17			// h+=Ch(e,f,g) +	eor	x17,x26,x26,ror#5 +	add	x25,x25,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x21,x21,x25			// d+=h +	eor	x28,x28,x27			// Maj(a,b,c) +	eor	x17,x8,x17,ror#34	// Sigma0(a) +	add	x25,x25,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x25,x25,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x6,x6			// 3 +#endif +	ldp	x7,x8,[x1],#2*8 +	add	x25,x25,x17			// h+=Sigma0(a) +	ror	x16,x21,#14 +	add	x24,x24,x28			// h+=K[i] +	eor	x9,x21,x21,ror#23 +	and	x17,x22,x21 +	bic	x28,x23,x21 +	add	x24,x24,x6			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x25,x26			// a^b, b^c in next round +	eor	x16,x16,x9,ror#18	// Sigma1(e) +	ror	x9,x25,#28 +	add	x24,x24,x17			// h+=Ch(e,f,g) +	eor	x17,x25,x25,ror#5 +	add	x24,x24,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x20,x20,x24			// d+=h +	eor	x19,x19,x26			// Maj(a,b,c) +	eor	x17,x9,x17,ror#34	// Sigma0(a) +	add	x24,x24,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x24,x24,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x7,x7			// 4 +#endif +	add	x24,x24,x17			// h+=Sigma0(a) +	ror	x16,x20,#14 +	add	x23,x23,x19			// h+=K[i] +	eor	x10,x20,x20,ror#23 +	and	x17,x21,x20 +	bic	x19,x22,x20 +	add	x23,x23,x7			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	
x19,x24,x25			// a^b, b^c in next round +	eor	x16,x16,x10,ror#18	// Sigma1(e) +	ror	x10,x24,#28 +	add	x23,x23,x17			// h+=Ch(e,f,g) +	eor	x17,x24,x24,ror#5 +	add	x23,x23,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x27,x27,x23			// d+=h +	eor	x28,x28,x25			// Maj(a,b,c) +	eor	x17,x10,x17,ror#34	// Sigma0(a) +	add	x23,x23,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x23,x23,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x8,x8			// 5 +#endif +	ldp	x9,x10,[x1],#2*8 +	add	x23,x23,x17			// h+=Sigma0(a) +	ror	x16,x27,#14 +	add	x22,x22,x28			// h+=K[i] +	eor	x11,x27,x27,ror#23 +	and	x17,x20,x27 +	bic	x28,x21,x27 +	add	x22,x22,x8			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x23,x24			// a^b, b^c in next round +	eor	x16,x16,x11,ror#18	// Sigma1(e) +	ror	x11,x23,#28 +	add	x22,x22,x17			// h+=Ch(e,f,g) +	eor	x17,x23,x23,ror#5 +	add	x22,x22,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x26,x26,x22			// d+=h +	eor	x19,x19,x24			// Maj(a,b,c) +	eor	x17,x11,x17,ror#34	// Sigma0(a) +	add	x22,x22,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x22,x22,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x9,x9			// 6 +#endif +	add	x22,x22,x17			// h+=Sigma0(a) +	ror	x16,x26,#14 +	add	x21,x21,x19			// h+=K[i] +	eor	x12,x26,x26,ror#23 +	and	x17,x27,x26 +	bic	x19,x20,x26 +	add	x21,x21,x9			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x22,x23			// a^b, b^c in next round +	eor	x16,x16,x12,ror#18	// Sigma1(e) +	ror	x12,x22,#28 +	add	x21,x21,x17			// h+=Ch(e,f,g) +	eor	x17,x22,x22,ror#5 +	add	x21,x21,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x25,x25,x21			// d+=h +	eor	x28,x28,x23			// Maj(a,b,c) +	eor	x17,x12,x17,ror#34	// Sigma0(a) +	add	x21,x21,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x21,x21,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x10,x10			// 7 +#endif +	ldp	x11,x12,[x1],#2*8 +	add	x21,x21,x17			// h+=Sigma0(a) +	ror	x16,x25,#14 +	add	x20,x20,x28			// h+=K[i] +	eor	x13,x25,x25,ror#23 +	and	x17,x26,x25 +	bic	x28,x27,x25 +	add	x20,x20,x10			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x21,x22			// a^b, b^c in next round +	eor	x16,x16,x13,ror#18	// Sigma1(e) +	ror	x13,x21,#28 +	add	x20,x20,x17			// h+=Ch(e,f,g) +	eor	x17,x21,x21,ror#5 +	add	x20,x20,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x24,x24,x20			// d+=h +	eor	x19,x19,x22			// Maj(a,b,c) +	eor	x17,x13,x17,ror#34	// Sigma0(a) +	add	x20,x20,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x20,x20,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x11,x11			// 8 +#endif +	add	x20,x20,x17			// h+=Sigma0(a) +	ror	x16,x24,#14 +	add	x27,x27,x19			// h+=K[i] +	eor	x14,x24,x24,ror#23 +	and	x17,x25,x24 +	bic	x19,x26,x24 +	add	x27,x27,x11			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x20,x21			// a^b, b^c in next round +	eor	x16,x16,x14,ror#18	// Sigma1(e) +	ror	x14,x20,#28 +	add	x27,x27,x17			// h+=Ch(e,f,g) +	eor	x17,x20,x20,ror#5 +	add	x27,x27,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x23,x23,x27			// d+=h +	eor	x28,x28,x21			// Maj(a,b,c) +	eor	x17,x14,x17,ror#34	// Sigma0(a) +	add	x27,x27,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x27,x27,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x12,x12			// 9 +#endif +	ldp	x13,x14,[x1],#2*8 +	add	x27,x27,x17			// h+=Sigma0(a) +	ror	x16,x23,#14 +	add	x26,x26,x28			// h+=K[i] +	eor	x15,x23,x23,ror#23 +	and	x17,x24,x23 +	bic	
x28,x25,x23 +	add	x26,x26,x12			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x27,x20			// a^b, b^c in next round +	eor	x16,x16,x15,ror#18	// Sigma1(e) +	ror	x15,x27,#28 +	add	x26,x26,x17			// h+=Ch(e,f,g) +	eor	x17,x27,x27,ror#5 +	add	x26,x26,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x22,x22,x26			// d+=h +	eor	x19,x19,x20			// Maj(a,b,c) +	eor	x17,x15,x17,ror#34	// Sigma0(a) +	add	x26,x26,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x26,x26,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x13,x13			// 10 +#endif +	add	x26,x26,x17			// h+=Sigma0(a) +	ror	x16,x22,#14 +	add	x25,x25,x19			// h+=K[i] +	eor	x0,x22,x22,ror#23 +	and	x17,x23,x22 +	bic	x19,x24,x22 +	add	x25,x25,x13			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x26,x27			// a^b, b^c in next round +	eor	x16,x16,x0,ror#18	// Sigma1(e) +	ror	x0,x26,#28 +	add	x25,x25,x17			// h+=Ch(e,f,g) +	eor	x17,x26,x26,ror#5 +	add	x25,x25,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x21,x21,x25			// d+=h +	eor	x28,x28,x27			// Maj(a,b,c) +	eor	x17,x0,x17,ror#34	// Sigma0(a) +	add	x25,x25,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x25,x25,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x14,x14			// 11 +#endif +	ldp	x15,x0,[x1],#2*8 +	add	x25,x25,x17			// h+=Sigma0(a) +	str	x6,[sp,#24] +	ror	x16,x21,#14 +	add	x24,x24,x28			// h+=K[i] +	eor	x6,x21,x21,ror#23 +	and	x17,x22,x21 +	bic	x28,x23,x21 +	add	x24,x24,x14			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x25,x26			// a^b, b^c in next round +	eor	x16,x16,x6,ror#18	// Sigma1(e) +	ror	x6,x25,#28 +	add	x24,x24,x17			// h+=Ch(e,f,g) +	eor	x17,x25,x25,ror#5 +	add	x24,x24,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x20,x20,x24			// d+=h +	eor	x19,x19,x26			// Maj(a,b,c) +	eor	x17,x6,x17,ror#34	// Sigma0(a) +	add	x24,x24,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x24,x24,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x15,x15			// 12 +#endif +	add	x24,x24,x17			// h+=Sigma0(a) +	str	x7,[sp,#0] +	ror	x16,x20,#14 +	add	x23,x23,x19			// h+=K[i] +	eor	x7,x20,x20,ror#23 +	and	x17,x21,x20 +	bic	x19,x22,x20 +	add	x23,x23,x15			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x24,x25			// a^b, b^c in next round +	eor	x16,x16,x7,ror#18	// Sigma1(e) +	ror	x7,x24,#28 +	add	x23,x23,x17			// h+=Ch(e,f,g) +	eor	x17,x24,x24,ror#5 +	add	x23,x23,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x27,x27,x23			// d+=h +	eor	x28,x28,x25			// Maj(a,b,c) +	eor	x17,x7,x17,ror#34	// Sigma0(a) +	add	x23,x23,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x23,x23,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x0,x0			// 13 +#endif +	ldp	x1,x2,[x1] +	add	x23,x23,x17			// h+=Sigma0(a) +	str	x8,[sp,#8] +	ror	x16,x27,#14 +	add	x22,x22,x28			// h+=K[i] +	eor	x8,x27,x27,ror#23 +	and	x17,x20,x27 +	bic	x28,x21,x27 +	add	x22,x22,x0			// h+=X[i] +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x23,x24			// a^b, b^c in next round +	eor	x16,x16,x8,ror#18	// Sigma1(e) +	ror	x8,x23,#28 +	add	x22,x22,x17			// h+=Ch(e,f,g) +	eor	x17,x23,x23,ror#5 +	add	x22,x22,x16			// h+=Sigma1(e) +	and	x19,x19,x28			// (b^c)&=(a^b) +	add	x26,x26,x22			// d+=h +	eor	x19,x19,x24			// Maj(a,b,c) +	eor	x17,x8,x17,ror#34	// Sigma0(a) +	add	x22,x22,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	//add	x22,x22,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x1,x1			// 14 +#endif +	ldr	x6,[sp,#24] +	add	x22,x22,x17			// 
h+=Sigma0(a) +	str	x9,[sp,#16] +	ror	x16,x26,#14 +	add	x21,x21,x19			// h+=K[i] +	eor	x9,x26,x26,ror#23 +	and	x17,x27,x26 +	bic	x19,x20,x26 +	add	x21,x21,x1			// h+=X[i] +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x22,x23			// a^b, b^c in next round +	eor	x16,x16,x9,ror#18	// Sigma1(e) +	ror	x9,x22,#28 +	add	x21,x21,x17			// h+=Ch(e,f,g) +	eor	x17,x22,x22,ror#5 +	add	x21,x21,x16			// h+=Sigma1(e) +	and	x28,x28,x19			// (b^c)&=(a^b) +	add	x25,x25,x21			// d+=h +	eor	x28,x28,x23			// Maj(a,b,c) +	eor	x17,x9,x17,ror#34	// Sigma0(a) +	add	x21,x21,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	//add	x21,x21,x17			// h+=Sigma0(a) +#ifndef	__ARMEB__ +	rev	x2,x2			// 15 +#endif +	ldr	x7,[sp,#0] +	add	x21,x21,x17			// h+=Sigma0(a) +	str	x10,[sp,#24] +	ror	x16,x25,#14 +	add	x20,x20,x28			// h+=K[i] +	ror	x9,x4,#1 +	and	x17,x26,x25 +	ror	x8,x1,#19 +	bic	x28,x27,x25 +	ror	x10,x21,#28 +	add	x20,x20,x2			// h+=X[i] +	eor	x16,x16,x25,ror#18 +	eor	x9,x9,x4,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x21,x22			// a^b, b^c in next round +	eor	x16,x16,x25,ror#41	// Sigma1(e) +	eor	x10,x10,x21,ror#34 +	add	x20,x20,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x8,x8,x1,ror#61 +	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1]) +	add	x20,x20,x16			// h+=Sigma1(e) +	eor	x19,x19,x22			// Maj(a,b,c) +	eor	x17,x10,x21,ror#39	// Sigma0(a) +	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14]) +	add	x3,x3,x12 +	add	x24,x24,x20			// d+=h +	add	x20,x20,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x3,x3,x9 +	add	x20,x20,x17			// h+=Sigma0(a) +	add	x3,x3,x8 +.Loop_16_xx: +	ldr	x8,[sp,#8] +	str	x11,[sp,#0] +	ror	x16,x24,#14 +	add	x27,x27,x19			// h+=K[i] +	ror	x10,x5,#1 +	and	x17,x25,x24 +	ror	x9,x2,#19 +	bic	x19,x26,x24 +	ror	x11,x20,#28 +	add	x27,x27,x3			// h+=X[i] +	eor	x16,x16,x24,ror#18 +	eor	x10,x10,x5,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x20,x21			// a^b, b^c in next round +	eor	x16,x16,x24,ror#41	// Sigma1(e) +	eor	x11,x11,x20,ror#34 +	add	x27,x27,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x9,x9,x2,ror#61 +	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1]) +	add	x27,x27,x16			// h+=Sigma1(e) +	eor	x28,x28,x21			// Maj(a,b,c) +	eor	x17,x11,x20,ror#39	// Sigma0(a) +	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14]) +	add	x4,x4,x13 +	add	x23,x23,x27			// d+=h +	add	x27,x27,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x4,x4,x10 +	add	x27,x27,x17			// h+=Sigma0(a) +	add	x4,x4,x9 +	ldr	x9,[sp,#16] +	str	x12,[sp,#8] +	ror	x16,x23,#14 +	add	x26,x26,x28			// h+=K[i] +	ror	x11,x6,#1 +	and	x17,x24,x23 +	ror	x10,x3,#19 +	bic	x28,x25,x23 +	ror	x12,x27,#28 +	add	x26,x26,x4			// h+=X[i] +	eor	x16,x16,x23,ror#18 +	eor	x11,x11,x6,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x27,x20			// a^b, b^c in next round +	eor	x16,x16,x23,ror#41	// Sigma1(e) +	eor	x12,x12,x27,ror#34 +	add	x26,x26,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x10,x10,x3,ror#61 +	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1]) +	add	x26,x26,x16			// h+=Sigma1(e) +	eor	x19,x19,x20			// Maj(a,b,c) +	eor	x17,x12,x27,ror#39	// Sigma0(a) +	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14]) +	add	x5,x5,x14 +	add	x22,x22,x26			// d+=h +	add	x26,x26,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x5,x5,x11 +	add	x26,x26,x17			// h+=Sigma0(a) +	add	x5,x5,x10 +	ldr	x10,[sp,#24] +	str	x13,[sp,#16] +	ror	x16,x22,#14 +	add	x25,x25,x19			// h+=K[i] +	ror	x12,x7,#1 +	and	x17,x23,x22 +	ror	x11,x4,#19 +	bic	x19,x24,x22 +	ror	x13,x26,#28 +	add	x25,x25,x5			// 
h+=X[i] +	eor	x16,x16,x22,ror#18 +	eor	x12,x12,x7,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x26,x27			// a^b, b^c in next round +	eor	x16,x16,x22,ror#41	// Sigma1(e) +	eor	x13,x13,x26,ror#34 +	add	x25,x25,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x11,x11,x4,ror#61 +	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1]) +	add	x25,x25,x16			// h+=Sigma1(e) +	eor	x28,x28,x27			// Maj(a,b,c) +	eor	x17,x13,x26,ror#39	// Sigma0(a) +	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14]) +	add	x6,x6,x15 +	add	x21,x21,x25			// d+=h +	add	x25,x25,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x6,x6,x12 +	add	x25,x25,x17			// h+=Sigma0(a) +	add	x6,x6,x11 +	ldr	x11,[sp,#0] +	str	x14,[sp,#24] +	ror	x16,x21,#14 +	add	x24,x24,x28			// h+=K[i] +	ror	x13,x8,#1 +	and	x17,x22,x21 +	ror	x12,x5,#19 +	bic	x28,x23,x21 +	ror	x14,x25,#28 +	add	x24,x24,x6			// h+=X[i] +	eor	x16,x16,x21,ror#18 +	eor	x13,x13,x8,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x25,x26			// a^b, b^c in next round +	eor	x16,x16,x21,ror#41	// Sigma1(e) +	eor	x14,x14,x25,ror#34 +	add	x24,x24,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x12,x12,x5,ror#61 +	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1]) +	add	x24,x24,x16			// h+=Sigma1(e) +	eor	x19,x19,x26			// Maj(a,b,c) +	eor	x17,x14,x25,ror#39	// Sigma0(a) +	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14]) +	add	x7,x7,x0 +	add	x20,x20,x24			// d+=h +	add	x24,x24,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x7,x7,x13 +	add	x24,x24,x17			// h+=Sigma0(a) +	add	x7,x7,x12 +	ldr	x12,[sp,#8] +	str	x15,[sp,#0] +	ror	x16,x20,#14 +	add	x23,x23,x19			// h+=K[i] +	ror	x14,x9,#1 +	and	x17,x21,x20 +	ror	x13,x6,#19 +	bic	x19,x22,x20 +	ror	x15,x24,#28 +	add	x23,x23,x7			// h+=X[i] +	eor	x16,x16,x20,ror#18 +	eor	x14,x14,x9,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x24,x25			// a^b, b^c in next round +	eor	x16,x16,x20,ror#41	// Sigma1(e) +	eor	x15,x15,x24,ror#34 +	add	x23,x23,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x13,x13,x6,ror#61 +	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1]) +	add	x23,x23,x16			// h+=Sigma1(e) +	eor	x28,x28,x25			// Maj(a,b,c) +	eor	x17,x15,x24,ror#39	// Sigma0(a) +	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14]) +	add	x8,x8,x1 +	add	x27,x27,x23			// d+=h +	add	x23,x23,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x8,x8,x14 +	add	x23,x23,x17			// h+=Sigma0(a) +	add	x8,x8,x13 +	ldr	x13,[sp,#16] +	str	x0,[sp,#8] +	ror	x16,x27,#14 +	add	x22,x22,x28			// h+=K[i] +	ror	x15,x10,#1 +	and	x17,x20,x27 +	ror	x14,x7,#19 +	bic	x28,x21,x27 +	ror	x0,x23,#28 +	add	x22,x22,x8			// h+=X[i] +	eor	x16,x16,x27,ror#18 +	eor	x15,x15,x10,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x23,x24			// a^b, b^c in next round +	eor	x16,x16,x27,ror#41	// Sigma1(e) +	eor	x0,x0,x23,ror#34 +	add	x22,x22,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x14,x14,x7,ror#61 +	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1]) +	add	x22,x22,x16			// h+=Sigma1(e) +	eor	x19,x19,x24			// Maj(a,b,c) +	eor	x17,x0,x23,ror#39	// Sigma0(a) +	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14]) +	add	x9,x9,x2 +	add	x26,x26,x22			// d+=h +	add	x22,x22,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x9,x9,x15 +	add	x22,x22,x17			// h+=Sigma0(a) +	add	x9,x9,x14 +	ldr	x14,[sp,#24] +	str	x1,[sp,#16] +	ror	x16,x26,#14 +	add	x21,x21,x19			// h+=K[i] +	ror	x0,x11,#1 +	and	x17,x27,x26 +	ror	x15,x8,#19 +	bic	x19,x20,x26 +	ror	x1,x22,#28 +	add	x21,x21,x9			// h+=X[i] +	eor	x16,x16,x26,ror#18 +	eor	x0,x0,x11,ror#8 +	
orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x22,x23			// a^b, b^c in next round +	eor	x16,x16,x26,ror#41	// Sigma1(e) +	eor	x1,x1,x22,ror#34 +	add	x21,x21,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x15,x15,x8,ror#61 +	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1]) +	add	x21,x21,x16			// h+=Sigma1(e) +	eor	x28,x28,x23			// Maj(a,b,c) +	eor	x17,x1,x22,ror#39	// Sigma0(a) +	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14]) +	add	x10,x10,x3 +	add	x25,x25,x21			// d+=h +	add	x21,x21,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x10,x10,x0 +	add	x21,x21,x17			// h+=Sigma0(a) +	add	x10,x10,x15 +	ldr	x15,[sp,#0] +	str	x2,[sp,#24] +	ror	x16,x25,#14 +	add	x20,x20,x28			// h+=K[i] +	ror	x1,x12,#1 +	and	x17,x26,x25 +	ror	x0,x9,#19 +	bic	x28,x27,x25 +	ror	x2,x21,#28 +	add	x20,x20,x10			// h+=X[i] +	eor	x16,x16,x25,ror#18 +	eor	x1,x1,x12,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x21,x22			// a^b, b^c in next round +	eor	x16,x16,x25,ror#41	// Sigma1(e) +	eor	x2,x2,x21,ror#34 +	add	x20,x20,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x0,x0,x9,ror#61 +	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1]) +	add	x20,x20,x16			// h+=Sigma1(e) +	eor	x19,x19,x22			// Maj(a,b,c) +	eor	x17,x2,x21,ror#39	// Sigma0(a) +	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14]) +	add	x11,x11,x4 +	add	x24,x24,x20			// d+=h +	add	x20,x20,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x11,x11,x1 +	add	x20,x20,x17			// h+=Sigma0(a) +	add	x11,x11,x0 +	ldr	x0,[sp,#8] +	str	x3,[sp,#0] +	ror	x16,x24,#14 +	add	x27,x27,x19			// h+=K[i] +	ror	x2,x13,#1 +	and	x17,x25,x24 +	ror	x1,x10,#19 +	bic	x19,x26,x24 +	ror	x3,x20,#28 +	add	x27,x27,x11			// h+=X[i] +	eor	x16,x16,x24,ror#18 +	eor	x2,x2,x13,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x20,x21			// a^b, b^c in next round +	eor	x16,x16,x24,ror#41	// Sigma1(e) +	eor	x3,x3,x20,ror#34 +	add	x27,x27,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x1,x1,x10,ror#61 +	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1]) +	add	x27,x27,x16			// h+=Sigma1(e) +	eor	x28,x28,x21			// Maj(a,b,c) +	eor	x17,x3,x20,ror#39	// Sigma0(a) +	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14]) +	add	x12,x12,x5 +	add	x23,x23,x27			// d+=h +	add	x27,x27,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x12,x12,x2 +	add	x27,x27,x17			// h+=Sigma0(a) +	add	x12,x12,x1 +	ldr	x1,[sp,#16] +	str	x4,[sp,#8] +	ror	x16,x23,#14 +	add	x26,x26,x28			// h+=K[i] +	ror	x3,x14,#1 +	and	x17,x24,x23 +	ror	x2,x11,#19 +	bic	x28,x25,x23 +	ror	x4,x27,#28 +	add	x26,x26,x12			// h+=X[i] +	eor	x16,x16,x23,ror#18 +	eor	x3,x3,x14,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x27,x20			// a^b, b^c in next round +	eor	x16,x16,x23,ror#41	// Sigma1(e) +	eor	x4,x4,x27,ror#34 +	add	x26,x26,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x2,x2,x11,ror#61 +	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1]) +	add	x26,x26,x16			// h+=Sigma1(e) +	eor	x19,x19,x20			// Maj(a,b,c) +	eor	x17,x4,x27,ror#39	// Sigma0(a) +	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14]) +	add	x13,x13,x6 +	add	x22,x22,x26			// d+=h +	add	x26,x26,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x13,x13,x3 +	add	x26,x26,x17			// h+=Sigma0(a) +	add	x13,x13,x2 +	ldr	x2,[sp,#24] +	str	x5,[sp,#16] +	ror	x16,x22,#14 +	add	x25,x25,x19			// h+=K[i] +	ror	x4,x15,#1 +	and	x17,x23,x22 +	ror	x3,x12,#19 +	bic	x19,x24,x22 +	ror	x5,x26,#28 +	add	x25,x25,x13			// h+=X[i] +	eor	x16,x16,x22,ror#18 +	eor	x4,x4,x15,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x26,x27			// a^b, b^c in next 
round +	eor	x16,x16,x22,ror#41	// Sigma1(e) +	eor	x5,x5,x26,ror#34 +	add	x25,x25,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x3,x3,x12,ror#61 +	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1]) +	add	x25,x25,x16			// h+=Sigma1(e) +	eor	x28,x28,x27			// Maj(a,b,c) +	eor	x17,x5,x26,ror#39	// Sigma0(a) +	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14]) +	add	x14,x14,x7 +	add	x21,x21,x25			// d+=h +	add	x25,x25,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x14,x14,x4 +	add	x25,x25,x17			// h+=Sigma0(a) +	add	x14,x14,x3 +	ldr	x3,[sp,#0] +	str	x6,[sp,#24] +	ror	x16,x21,#14 +	add	x24,x24,x28			// h+=K[i] +	ror	x5,x0,#1 +	and	x17,x22,x21 +	ror	x4,x13,#19 +	bic	x28,x23,x21 +	ror	x6,x25,#28 +	add	x24,x24,x14			// h+=X[i] +	eor	x16,x16,x21,ror#18 +	eor	x5,x5,x0,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x25,x26			// a^b, b^c in next round +	eor	x16,x16,x21,ror#41	// Sigma1(e) +	eor	x6,x6,x25,ror#34 +	add	x24,x24,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x4,x4,x13,ror#61 +	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1]) +	add	x24,x24,x16			// h+=Sigma1(e) +	eor	x19,x19,x26			// Maj(a,b,c) +	eor	x17,x6,x25,ror#39	// Sigma0(a) +	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14]) +	add	x15,x15,x8 +	add	x20,x20,x24			// d+=h +	add	x24,x24,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x15,x15,x5 +	add	x24,x24,x17			// h+=Sigma0(a) +	add	x15,x15,x4 +	ldr	x4,[sp,#8] +	str	x7,[sp,#0] +	ror	x16,x20,#14 +	add	x23,x23,x19			// h+=K[i] +	ror	x6,x1,#1 +	and	x17,x21,x20 +	ror	x5,x14,#19 +	bic	x19,x22,x20 +	ror	x7,x24,#28 +	add	x23,x23,x15			// h+=X[i] +	eor	x16,x16,x20,ror#18 +	eor	x6,x6,x1,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x24,x25			// a^b, b^c in next round +	eor	x16,x16,x20,ror#41	// Sigma1(e) +	eor	x7,x7,x24,ror#34 +	add	x23,x23,x17			// h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x5,x5,x14,ror#61 +	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1]) +	add	x23,x23,x16			// h+=Sigma1(e) +	eor	x28,x28,x25			// Maj(a,b,c) +	eor	x17,x7,x24,ror#39	// Sigma0(a) +	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14]) +	add	x0,x0,x9 +	add	x27,x27,x23			// d+=h +	add	x23,x23,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x0,x0,x6 +	add	x23,x23,x17			// h+=Sigma0(a) +	add	x0,x0,x5 +	ldr	x5,[sp,#16] +	str	x8,[sp,#8] +	ror	x16,x27,#14 +	add	x22,x22,x28			// h+=K[i] +	ror	x7,x2,#1 +	and	x17,x20,x27 +	ror	x6,x15,#19 +	bic	x28,x21,x27 +	ror	x8,x23,#28 +	add	x22,x22,x0			// h+=X[i] +	eor	x16,x16,x27,ror#18 +	eor	x7,x7,x2,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x23,x24			// a^b, b^c in next round +	eor	x16,x16,x27,ror#41	// Sigma1(e) +	eor	x8,x8,x23,ror#34 +	add	x22,x22,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x6,x6,x15,ror#61 +	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1]) +	add	x22,x22,x16			// h+=Sigma1(e) +	eor	x19,x19,x24			// Maj(a,b,c) +	eor	x17,x8,x23,ror#39	// Sigma0(a) +	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14]) +	add	x1,x1,x10 +	add	x26,x26,x22			// d+=h +	add	x22,x22,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x1,x1,x7 +	add	x22,x22,x17			// h+=Sigma0(a) +	add	x1,x1,x6 +	ldr	x6,[sp,#24] +	str	x9,[sp,#16] +	ror	x16,x26,#14 +	add	x21,x21,x19			// h+=K[i] +	ror	x8,x3,#1 +	and	x17,x27,x26 +	ror	x7,x0,#19 +	bic	x19,x20,x26 +	ror	x9,x22,#28 +	add	x21,x21,x1			// h+=X[i] +	eor	x16,x16,x26,ror#18 +	eor	x8,x8,x3,ror#8 +	orr	x17,x17,x19			// Ch(e,f,g) +	eor	x19,x22,x23			// a^b, b^c in next round +	eor	x16,x16,x26,ror#41	// Sigma1(e) +	eor	x9,x9,x22,ror#34 +	add	x21,x21,x17			// 
h+=Ch(e,f,g) +	and	x28,x28,x19			// (b^c)&=(a^b) +	eor	x7,x7,x0,ror#61 +	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1]) +	add	x21,x21,x16			// h+=Sigma1(e) +	eor	x28,x28,x23			// Maj(a,b,c) +	eor	x17,x9,x22,ror#39	// Sigma0(a) +	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14]) +	add	x2,x2,x11 +	add	x25,x25,x21			// d+=h +	add	x21,x21,x28			// h+=Maj(a,b,c) +	ldr	x28,[x30],#8		// *K++, x19 in next round +	add	x2,x2,x8 +	add	x21,x21,x17			// h+=Sigma0(a) +	add	x2,x2,x7 +	ldr	x7,[sp,#0] +	str	x10,[sp,#24] +	ror	x16,x25,#14 +	add	x20,x20,x28			// h+=K[i] +	ror	x9,x4,#1 +	and	x17,x26,x25 +	ror	x8,x1,#19 +	bic	x28,x27,x25 +	ror	x10,x21,#28 +	add	x20,x20,x2			// h+=X[i] +	eor	x16,x16,x25,ror#18 +	eor	x9,x9,x4,ror#8 +	orr	x17,x17,x28			// Ch(e,f,g) +	eor	x28,x21,x22			// a^b, b^c in next round +	eor	x16,x16,x25,ror#41	// Sigma1(e) +	eor	x10,x10,x21,ror#34 +	add	x20,x20,x17			// h+=Ch(e,f,g) +	and	x19,x19,x28			// (b^c)&=(a^b) +	eor	x8,x8,x1,ror#61 +	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1]) +	add	x20,x20,x16			// h+=Sigma1(e) +	eor	x19,x19,x22			// Maj(a,b,c) +	eor	x17,x10,x21,ror#39	// Sigma0(a) +	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14]) +	add	x3,x3,x12 +	add	x24,x24,x20			// d+=h +	add	x20,x20,x19			// h+=Maj(a,b,c) +	ldr	x19,[x30],#8		// *K++, x28 in next round +	add	x3,x3,x9 +	add	x20,x20,x17			// h+=Sigma0(a) +	add	x3,x3,x8 +	cbnz	x19,.Loop_16_xx + +	ldp	x0,x2,[x29,#96] +	ldr	x1,[x29,#112] +	sub	x30,x30,#648		// rewind + +	ldp	x3,x4,[x0] +	ldp	x5,x6,[x0,#2*8] +	add	x1,x1,#14*8			// advance input pointer +	ldp	x7,x8,[x0,#4*8] +	add	x20,x20,x3 +	ldp	x9,x10,[x0,#6*8] +	add	x21,x21,x4 +	add	x22,x22,x5 +	add	x23,x23,x6 +	stp	x20,x21,[x0] +	add	x24,x24,x7 +	add	x25,x25,x8 +	stp	x22,x23,[x0,#2*8] +	add	x26,x26,x9 +	add	x27,x27,x10 +	cmp	x1,x2 +	stp	x24,x25,[x0,#4*8] +	stp	x26,x27,[x0,#6*8] +	b.ne	.Loop + +	ldp	x19,x20,[x29,#16] +	add	sp,sp,#4*8 +	ldp	x21,x22,[x29,#32] +	ldp	x23,x24,[x29,#48] +	ldp	x25,x26,[x29,#64] +	ldp	x27,x28,[x29,#80] +	ldp	x29,x30,[sp],#128 +	ret +.size	sha512_block_data_order,.-sha512_block_data_order + +.align	6 +.type	K512,%object +K512: +	.quad	0x428a2f98d728ae22,0x7137449123ef65cd +	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +	.quad	0x3956c25bf348b538,0x59f111f1b605d019 +	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118 +	.quad	0xd807aa98a3030242,0x12835b0145706fbe +	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1 +	.quad	0x9bdc06a725c71235,0xc19bf174cf692694 +	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3 +	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483 +	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +	.quad	0x983e5152ee66dfab,0xa831c66d2db43210 +	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4 +	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725 +	.quad	0x06ca6351e003826f,0x142929670a0e6e70 +	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926 +	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df +	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8 +	.quad	0x81c2c92e47edaee6,0x92722c851482353b +	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001 +	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30 +	.quad	0xd192e819d6ef5218,0xd69906245565a910 +	.quad	0xf40e35855771202a,0x106aa07032bbd1b8 +	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53 +	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60 +	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec +	.quad	0x90befffa23631e28,0xa4506cebde82bde9 +	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b +	.quad	
0xca273eceea26619c,0xd186b8c721c0c207 +	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6 +	.quad	0x113f9804bef90dae,0x1b710b35131c471b +	.quad	0x28db77f523047d84,0x32caab7b40c72493 +	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a +	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817 +	.quad	0	// terminator +.size	K512,.-K512 +.align	3 +.LOPENSSL_armcap_P: +	.quad	OPENSSL_armcap_P-. +.asciz	"SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align	2 +.comm	OPENSSL_armcap_P,4,4 diff --git a/main/openssl/crypto/sha/asm/sha512-armv8.pl b/main/openssl/crypto/sha/asm/sha512-armv8.pl new file mode 100644 index 00000000..6935ed65 --- /dev/null +++ b/main/openssl/crypto/sha/asm/sha512-armv8.pl @@ -0,0 +1,414 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +#		SHA256-hw	SHA256(*)	SHA512 +# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**)) +# Cortex-A5x	n/a		n/a		n/a +#  +# (*)	Software SHA256 results are of lesser relevance, presented +#	mostly for informational purposes. +# (**)	The result is a trade-off: it's possible to improve it by +#	10%, but at the cost of 20% loss on Cortex-A5x. + +$flavour=shift; +$output=shift; +open STDOUT,">$output"; + +if ($output =~ /512/) { +	$BITS=512; +	$SZ=8; +	@Sigma0=(28,34,39); +	@Sigma1=(14,18,41); +	@sigma0=(1,  8, 7); +	@sigma1=(19,61, 6); +	$rounds=80; +	$reg_t="x"; +} else { +	$BITS=256; +	$SZ=4; +	@Sigma0=( 2,13,22); +	@Sigma1=( 6,11,25); +	@sigma0=( 7,18, 3); +	@sigma1=(17,19,10); +	$rounds=64; +	$reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); +   $T0=@X[$i+3] if ($i<11); + +$code.=<<___	if ($i<16); +#ifndef	__ARMEB__ +	rev	@X[$i],@X[$i]			// $i +#endif +___ +$code.=<<___	if ($i<13 && ($i&1)); +	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___	if ($i==13); +	ldp	@X[14],@X[15],[$inp] +___ +$code.=<<___	if ($i>=14); +	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___	if ($i>0 && $i<16); +	add	$a,$a,$t1			// h+=Sigma0(a) +___ +$code.=<<___	if ($i>=11); +	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
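The round code above leans on the standard SHA-512 bit functions named in its comments. As a reference only, here is a minimal C sketch of those functions, with the rotate/shift amounts taken directly from the ror#/lsr# operands and the @Sigma/@sigma tables above; the helper names (rotr64, Ch, Maj, Sigma0, Sigma1, sigma0, sigma1) are illustrative and not part of the imported sources.

#include <stdint.h>

/* Right-rotate of a 64-bit word (helper name is illustrative). */
static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

/* Ch(e,f,g): computed above with and/bic/orr as (e & f) | (~e & g). */
static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g) { return (e & f) | (~e & g); }

/* Maj(a,b,c): the "(b^c)&=(a^b)" trick from the round code, equal to the textbook majority. */
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return ((b ^ c) & (a ^ b)) ^ b; }

/* Sigma1(e) = ror#14 ^ ror#18 ^ ror#41, Sigma0(a) = ror#28 ^ ror#34 ^ ror#39. */
static inline uint64_t Sigma1(uint64_t e) { return rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41); }
static inline uint64_t Sigma0(uint64_t a) { return rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39); }

/* Message-schedule sigmas: sigma0 = ror#1 ^ ror#8 ^ lsr#7, sigma1 = ror#19 ^ ror#61 ^ lsr#6. */
static inline uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }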
+$code.=<<___	if ($i<15); +	ror	$t0,$e,#$Sigma1[0] +	add	$h,$h,$t2			// h+=K[i] +	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` +	and	$t1,$f,$e +	bic	$t2,$g,$e +	add	$h,$h,@X[$i&15]			// h+=X[i] +	orr	$t1,$t1,$t2			// Ch(e,f,g) +	eor	$t2,$a,$b			// a^b, b^c in next round +	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e) +	ror	$T0,$a,#$Sigma0[0] +	add	$h,$h,$t1			// h+=Ch(e,f,g) +	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` +	add	$h,$h,$t0			// h+=Sigma1(e) +	and	$t3,$t3,$t2			// (b^c)&=(a^b) +	add	$d,$d,$h			// d+=h +	eor	$t3,$t3,$b			// Maj(a,b,c) +	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a) +	add	$h,$h,$t3			// h+=Maj(a,b,c) +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round +	//add	$h,$h,$t1			// h+=Sigma0(a) +___ +$code.=<<___	if ($i>=15); +	ror	$t0,$e,#$Sigma1[0] +	add	$h,$h,$t2			// h+=K[i] +	ror	$T1,@X[($j+1)&15],#$sigma0[0] +	and	$t1,$f,$e +	ror	$T2,@X[($j+14)&15],#$sigma1[0] +	bic	$t2,$g,$e +	ror	$T0,$a,#$Sigma0[0] +	add	$h,$h,@X[$i&15]			// h+=X[i] +	eor	$t0,$t0,$e,ror#$Sigma1[1] +	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1] +	orr	$t1,$t1,$t2			// Ch(e,f,g) +	eor	$t2,$a,$b			// a^b, b^c in next round +	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e) +	eor	$T0,$T0,$a,ror#$Sigma0[1] +	add	$h,$h,$t1			// h+=Ch(e,f,g) +	and	$t3,$t3,$t2			// (b^c)&=(a^b) +	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1] +	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1]) +	add	$h,$h,$t0			// h+=Sigma1(e) +	eor	$t3,$t3,$b			// Maj(a,b,c) +	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a) +	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14]) +	add	@X[$j],@X[$j],@X[($j+9)&15] +	add	$d,$d,$h			// d+=h +	add	$h,$h,$t3			// h+=Maj(a,b,c) +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round +	add	@X[$j],@X[$j],$T1 +	add	$h,$h,$t1			// h+=Sigma0(a) +	add	@X[$j],@X[$j],$T2 +___ +	($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.globl	$func +.type	$func,%function +.align	6 +$func: +___ +$code.=<<___	if ($SZ==4); +	ldr	x16,.LOPENSSL_armcap_P +	adr	x17,.LOPENSSL_armcap_P +	add	x16,x16,x17 +	ldr	w16,[x16] +	tst	w16,#ARMV8_SHA256 +	b.ne	.Lv8_entry +___ +$code.=<<___; +	stp	x29,x30,[sp,#-128]! 
+	add	x29,sp,#0 + +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] +	sub	sp,sp,#4*$SZ + +	ldp	$A,$B,[$ctx]				// load context +	ldp	$C,$D,[$ctx,#2*$SZ] +	ldp	$E,$F,[$ctx,#4*$SZ] +	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input +	ldp	$G,$H,[$ctx,#6*$SZ] +	adr	$Ktbl,K$BITS +	stp	$ctx,$num,[x29,#96] + +.Loop: +	ldp	@X[0],@X[1],[$inp],#2*$SZ +	ldr	$t2,[$Ktbl],#$SZ			// *K++ +	eor	$t3,$B,$C				// magic seed +	str	$inp,[x29,#112] +___ +for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +	cbnz	$t2,.Loop_16_xx + +	ldp	$ctx,$num,[x29,#96] +	ldr	$inp,[x29,#112] +	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind + +	ldp	@X[0],@X[1],[$ctx] +	ldp	@X[2],@X[3],[$ctx,#2*$SZ] +	add	$inp,$inp,#14*$SZ			// advance input pointer +	ldp	@X[4],@X[5],[$ctx,#4*$SZ] +	add	$A,$A,@X[0] +	ldp	@X[6],@X[7],[$ctx,#6*$SZ] +	add	$B,$B,@X[1] +	add	$C,$C,@X[2] +	add	$D,$D,@X[3] +	stp	$A,$B,[$ctx] +	add	$E,$E,@X[4] +	add	$F,$F,@X[5] +	stp	$C,$D,[$ctx,#2*$SZ] +	add	$G,$G,@X[6] +	add	$H,$H,@X[7] +	cmp	$inp,$num +	stp	$E,$F,[$ctx,#4*$SZ] +	stp	$G,$H,[$ctx,#6*$SZ] +	b.ne	.Loop + +	ldp	x19,x20,[x29,#16] +	add	sp,sp,#4*$SZ +	ldp	x21,x22,[x29,#32] +	ldp	x23,x24,[x29,#48] +	ldp	x25,x26,[x29,#64] +	ldp	x27,x28,[x29,#80] +	ldp	x29,x30,[sp],#128 +	ret +.size	$func,.-$func + +.align	6 +.type	K$BITS,%object +K$BITS: +___ +$code.=<<___ if ($SZ==8); +	.quad	0x428a2f98d728ae22,0x7137449123ef65cd +	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +	.quad	0x3956c25bf348b538,0x59f111f1b605d019 +	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118 +	.quad	0xd807aa98a3030242,0x12835b0145706fbe +	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1 +	.quad	0x9bdc06a725c71235,0xc19bf174cf692694 +	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3 +	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483 +	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +	.quad	0x983e5152ee66dfab,0xa831c66d2db43210 +	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4 +	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725 +	.quad	0x06ca6351e003826f,0x142929670a0e6e70 +	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926 +	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df +	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8 +	.quad	0x81c2c92e47edaee6,0x92722c851482353b +	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001 +	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30 +	.quad	0xd192e819d6ef5218,0xd69906245565a910 +	.quad	0xf40e35855771202a,0x106aa07032bbd1b8 +	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53 +	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60 +	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec +	.quad	0x90befffa23631e28,0xa4506cebde82bde9 +	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b +	.quad	0xca273eceea26619c,0xd186b8c721c0c207 +	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6 +	.quad	0x113f9804bef90dae,0x1b710b35131c471b +	.quad	0x28db77f523047d84,0x32caab7b40c72493 +	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a +	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817 +	.quad	0	// terminator +___ +$code.=<<___ if ($SZ==4); +	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +	.long	
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +	.long	0	//terminator +___ +$code.=<<___; +.size	K$BITS,.-K$BITS +.align	3 +.LOPENSSL_armcap_P: +	.quad	OPENSSL_armcap_P-. +.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align	2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.type	sha256_block_armv8,%function +.align	6 +sha256_block_armv8: +.Lv8_entry: +	stp		x29,x30,[sp,#-16]! +	add		x29,sp,#0 + +	ld1.32		{$ABCD,$EFGH},[$ctx] +	adr		$Ktbl,K256 + +.Loop_hw: +	ld1		{@MSG[0]-@MSG[3]},[$inp],#64 +	sub		$num,$num,#1 +	ld1.32		{$W0},[$Ktbl],#16 +	rev32		@MSG[0],@MSG[0] +	rev32		@MSG[1],@MSG[1] +	rev32		@MSG[2],@MSG[2] +	rev32		@MSG[3],@MSG[3] +	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload +	orr		$EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; +	ld1.32		{$W1},[$Ktbl],#16 +	add.i32		$W0,$W0,@MSG[0] +	sha256su0	@MSG[0],@MSG[1] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 +	sha256su1	@MSG[0],@MSG[2],@MSG[3] +___ +	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG)); +} +$code.=<<___; +	ld1.32		{$W1},[$Ktbl],#16 +	add.i32		$W0,$W0,@MSG[0] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	ld1.32		{$W0},[$Ktbl],#16 +	add.i32		$W1,$W1,@MSG[1] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	ld1.32		{$W1},[$Ktbl] +	add.i32		$W0,$W0,@MSG[2] +	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	add.i32		$W1,$W1,@MSG[3] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	add.i32		$ABCD,$ABCD,$ABCD_SAVE +	add.i32		$EFGH,$EFGH,$EFGH_SAVE + +	cbnz		$num,.Loop_hw + +	st1.32		{$ABCD,$EFGH},[$ctx] + +	ldr		x29,[sp],#16 +	ret +.size	sha256_block_armv8,.-sha256_block_armv8 +___ +} + +$code.=<<___; +.comm	OPENSSL_armcap_P,4,4 +___ + +{   my  %opcode = ( +	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000, +	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	); + +    sub unsha256 { +	my ($mnemonic,$arg)=@_; + +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o +	&& +	sprintf ".inst\t0x%08x\t//%s %s", +			$opcode{$mnemonic}|$1|($2<<5)|($3<<16), +			$mnemonic,$arg; +    } +} + +foreach(split("\n",$code)) { + +	s/\`([^\`]*)\`/eval($1)/geo; + +	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; + +	s/\.\w?32\b//o		and s/\.16b/\.4s/go; +	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go; + +	print $_,"\n"; +} + +close STDOUT; diff --git a/main/openssl/import_openssl.sh b/main/openssl/import_openssl.sh index 4ae7e333..f16596bc 100755 --- a/main/openssl/import_openssl.sh +++ b/main/openssl/import_openssl.sh @@ -128,7 +128,16 @@ function 
default_asm_file () {  function gen_asm_arm () {    local OUT    OUT=$(default_asm_file "$@") -  $PERL_EXE "$1" > "$OUT" +  $PERL_EXE "$1" void "$OUT" > "$OUT" +} + +# Generate an ARMv8 64-bit assembly file. +# $1: generator (perl script) +# $2: [optional] output file name +function gen_asm_arm64 () { +  local OUT +  OUT=$(default_asm_file "$@") +  $PERL_EXE "$1" linux64 "$OUT" > "$OUT"  }  function gen_asm_mips () { @@ -177,6 +186,54 @@ function print_autogenerated_header() {    echo "#"  } +function run_verbose() { +  echo Running: $@ +  $@ +} + +function scan_opensslconf_for_flags() { +  for flag in "$@"; do +    awk "/^#define ${flag}$/ { print \$2 }" crypto/opensslconf.h +  done +} + +CRYPTO_CONF_FLAGS=( +OPENSSL_CPUID_OBJ +DES_LONG +DES_PTR +DES_RISC1 +DES_RISC2 +DES_UNROLL +RC4_INT +RC4_CHUNK +RC4_INDEX +) + +function check_asm_flags() { +  local arch="$1" +  local target="$2" +  local unsorted_flags +  local expected_flags +  local actual_flags +  local defines="OPENSSL_CRYPTO_DEFINES_$arch" + +  PERL=/usr/bin/perl run_verbose ./Configure $CONFIGURE_ARGS $target + +  unsorted_flags="$(awk '/^CFLAG=/ { sub(/^CFLAG= .*-Wall /, ""); gsub(/-D/, ""); print; }' Makefile)" +  unsorted_flags="$unsorted_flags $(scan_opensslconf_for_flags "${CRYPTO_CONF_FLAGS[@]}")" + +  expected_flags="$(echo $unsorted_flags | tr ' ' '\n' | sort | tr '\n' ' ')" +  actual_flags="$(echo ${!defines} | tr ' ' '\n' | sort | tr '\n' ' ')" + +  if [[ $actual_flags != $expected_flags ]]; then +    echo ${defines} is wrong! +    echo "    $actual_flags" +    echo Please update to: +    echo "    $expected_flags" +    exit 1 +  fi +} +  # Run Configure and generate headers  # $1: 32 for 32-bit arch, 64 for 64-bit arch, trusty for Trusty  # $2: 1 if building for static version @@ -192,9 +249,9 @@ function generate_build_config_headers() {    fi    if [[ $1 == trusty ]] ; then -    PERL=/usr/bin/perl ./Configure $CONFIGURE_ARGS_TRUSTY +    PERL=/usr/bin/perl run_verbose ./Configure $CONFIGURE_ARGS_TRUSTY    else -    PERL=/usr/bin/perl ./Configure $CONFIGURE_ARGS ${!configure_args_bits} ${!configure_args_stat} +    PERL=/usr/bin/perl run_verbose ./Configure $CONFIGURE_ARGS ${!configure_args_bits} ${!configure_args_stat}    fi    rm -f apps/CA.pl.bak crypto/opensslconf.h.bak @@ -424,8 +481,16 @@ function import() {    declare -r OPENSSL_SOURCE=$1    untar $OPENSSL_SOURCE readonly    applypatches $OPENSSL_DIR +  convert_iso8859_to_utf8 $OPENSSL_DIR    cd $OPENSSL_DIR + +  # Check the ASM flags for each arch +  check_asm_flags arm linux-armv4 +  check_asm_flags arm64 linux-aarch64 +  check_asm_flags x86 linux-elf +  check_asm_flags x86_64 linux-x86_64 +    generate_build_config_mk    generate_opensslconf_h @@ -443,14 +508,23 @@ function import() {    # Generate arm asm    gen_asm_arm crypto/aes/asm/aes-armv4.pl +  gen_asm_arm crypto/aes/asm/aesv8-armx.pl    gen_asm_arm crypto/aes/asm/bsaes-armv7.pl    gen_asm_arm crypto/bn/asm/armv4-gf2m.pl    gen_asm_arm crypto/bn/asm/armv4-mont.pl    gen_asm_arm crypto/modes/asm/ghash-armv4.pl +  gen_asm_arm crypto/modes/asm/ghashv8-armx.pl    gen_asm_arm crypto/sha/asm/sha1-armv4-large.pl    gen_asm_arm crypto/sha/asm/sha256-armv4.pl    gen_asm_arm crypto/sha/asm/sha512-armv4.pl +  # Generate armv8 asm +  gen_asm_arm64 crypto/aes/asm/aesv8-armx.pl crypto/aes/asm/aesv8-armx-64.S +  gen_asm_arm64 crypto/modes/asm/ghashv8-armx.pl crypto/modes/asm/ghashv8-armx-64.S +  gen_asm_arm64 crypto/sha/asm/sha1-armv8.pl +  gen_asm_arm64 crypto/sha/asm/sha512-armv8.pl crypto/sha/asm/sha256-armv8.S +  
gen_asm_arm64 crypto/sha/asm/sha512-armv8.pl +    # Generate mips asm    gen_asm_mips crypto/aes/asm/aes-mips.pl    gen_asm_mips crypto/bn/asm/mips.pl crypto/bn/asm/bn-mips.S @@ -585,7 +659,6 @@ function untar() {    # Process new source    tar -zxf $OPENSSL_SOURCE -  convert_iso8859_to_utf8 $OPENSSL_DIR    cp -RfP $OPENSSL_DIR $OPENSSL_DIR_ORIG    if [ ! -z $readonly ]; then      find $OPENSSL_DIR_ORIG -type f -print0 | xargs -0 chmod a-w diff --git a/main/openssl/include/openssl/opensslconf-32.h b/main/openssl/include/openssl/opensslconf-32.h index d6625489..caf6f1b8 100644 --- a/main/openssl/include/openssl/opensslconf-32.h +++ b/main/openssl/include/openssl/opensslconf-32.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/include/openssl/opensslconf-64.h b/main/openssl/include/openssl/opensslconf-64.h index 70c5a2cb..88fb0419 100644 --- a/main/openssl/include/openssl/opensslconf-64.h +++ b/main/openssl/include/openssl/opensslconf-64.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/include/openssl/opensslconf-static-32.h b/main/openssl/include/openssl/opensslconf-static-32.h index d6625489..caf6f1b8 100644 --- a/main/openssl/include/openssl/opensslconf-static-32.h +++ b/main/openssl/include/openssl/opensslconf-static-32.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/include/openssl/opensslconf-static-64.h b/main/openssl/include/openssl/opensslconf-static-64.h index 70c5a2cb..88fb0419 100644 --- a/main/openssl/include/openssl/opensslconf-static-64.h +++ b/main/openssl/include/openssl/opensslconf-static-64.h @@ -53,6 +53,9 @@  #ifndef OPENSSL_NO_RFC3779  # define OPENSSL_NO_RFC3779  #endif +#ifndef OPENSSL_NO_RIPEMD +# define OPENSSL_NO_RIPEMD +#endif  #ifndef OPENSSL_NO_RSAX  # define OPENSSL_NO_RSAX  #endif @@ -137,6 +140,9 @@  # if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)  #  define NO_RFC3779  # endif +# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD) +#  define NO_RIPEMD +# endif  # if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)  #  define NO_RSAX  # endif diff --git a/main/openssl/include/openssl/ssl.h b/main/openssl/include/openssl/ssl.h index 06bb90f8..a85841b3 100644 --- a/main/openssl/include/openssl/ssl.h +++ 
b/main/openssl/include/openssl/ssl.h @@ -1816,6 +1816,7 @@ int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);  char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);  const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);  unsigned long 	SSL_CIPHER_get_id(const SSL_CIPHER *c); +const char *	SSL_CIPHER_authentication_method(const SSL_CIPHER* cipher);  int	SSL_get_fd(const SSL *s);  int	SSL_get_rfd(const SSL *s); diff --git a/main/openssl/include/openssl/tls1.h b/main/openssl/include/openssl/tls1.h index 66520893..b9a0899e 100644 --- a/main/openssl/include/openssl/tls1.h +++ b/main/openssl/include/openssl/tls1.h @@ -532,9 +532,11 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)  #define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031  #define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032 -/* ECDHE PSK ciphersuites from RFC 5489 */ -#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256    0x0300C037 -#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384    0x0300C038 +/* ECDHE PSK ciphersuites from RFC5489 + * SHA-2 cipher suites are omitted because they cannot be used safely with + * SSLv3. */ +#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA          0x0300C035 +#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA          0x0300C036  /* XXX   * Inconsistency alert: @@ -687,9 +689,9 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)  #define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"  #define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384" -/* ECDHE PSK ciphersuites from RFC 5489 */ -#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256  "ECDHE-PSK-WITH-AES-128-CBC-SHA256" -#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384  "ECDHE-PSK-WITH-AES-256-CBC-SHA384" +/* ECDHE PSK ciphersuites from RFC5489 */ +#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA         "ECDHE-PSK-AES128-CBC-SHA" +#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA         "ECDHE-PSK-AES256-CBC-SHA"  #define TLS_CT_RSA_SIGN			1  #define TLS_CT_DSS_SIGN			2 diff --git a/main/openssl/openssl.config b/main/openssl/openssl.config index d44b0bbc..867711fe 100644 --- a/main/openssl/openssl.config +++ b/main/openssl/openssl.config @@ -13,6 +13,7 @@ no-md2 \  no-mdc2 \  no-rc5 \  no-rdrand \ +no-ripemd \  no-rfc3779 \  no-rsax \  no-sctp \ @@ -53,6 +54,7 @@ crypto/cast \  crypto/idea \  crypto/md2 \  crypto/rc5 \ +crypto/ripemd \  crypto/seed \  crypto/whrlpool \  demos \ @@ -177,7 +179,6 @@ crypto/rand/Makefile \  crypto/rand/rand_vms.c \  crypto/rc2/Makefile \  crypto/rc4/Makefile \ -crypto/ripemd/Makefile \  crypto/rsa/Makefile \  crypto/sha/Makefile \  crypto/sha/sha_one.c \ @@ -219,7 +220,7 @@ ssl/heartbeat_test.c \  ssl/install-ssl.com \  ssl/ssl-lib.com \  ssl/ssl_task.c \ -"  +"  NEEDED_SOURCES="\  apps \ @@ -237,19 +238,24 @@ NO_WINDOWS_BRAINDEATH \  "  OPENSSL_CRYPTO_DEFINES_arm="\ +AES_ASM \ +BSAES_ASM \ +DES_UNROLL \ +GHASH_ASM \  OPENSSL_BN_ASM_GF2m \  OPENSSL_BN_ASM_MONT \  OPENSSL_CPUID_OBJ \ -GHASH_ASM \ -AES_ASM \ -BSAES_ASM \  SHA1_ASM \  SHA256_ASM \  SHA512_ASM \  "  OPENSSL_CRYPTO_DEFINES_arm64="\ -OPENSSL_NO_ASM \ +DES_UNROLL \ +OPENSSL_CPUID_OBJ \ +SHA1_ASM \ +SHA256_ASM \ +SHA512_ASM \  "  OPENSSL_CRYPTO_DEFINES_mips="\ @@ -260,39 +266,40 @@ SHA256_ASM \  "  OPENSSL_CRYPTO_DEFINES_x86="\ -OPENSSL_IA32_SSE2 \ +AES_ASM \ +DES_PTR \ +DES_RISC1 \ +DES_UNROLL \ +GHASH_ASM \ +MD5_ASM \  OPENSSL_BN_ASM_GF2m \  OPENSSL_BN_ASM_MONT \  OPENSSL_BN_ASM_PART_WORDS \ -AES_ASM \ 
-VPAES_ASM \ -GHASH_ASM \ +OPENSSL_CPUID_OBJ \ +OPENSSL_IA32_SSE2 \ +RC4_INDEX \ +RMD160_ASM \  SHA1_ASM \  SHA256_ASM \  SHA512_ASM \ -MD5_ASM \ -DES_PTR \ -DES_RISC1 \ -DES_UNROLL \ -OPENSSL_CPUID_OBJ \ +VPAES_ASM \  "  OPENSSL_CRYPTO_DEFINES_x86_64="\ -OPENSSL_BN_ASM_GF2m \ -OPENSSL_BN_ASM_MONT \ -OPENSSL_BN_ASM_MONT5 \  AES_ASM \ -VPAES_ASM \  BSAES_ASM \ +DES_UNROLL \  GHASH_ASM \ +MD5_ASM \ +OPENSSL_BN_ASM_GF2m \ +OPENSSL_BN_ASM_MONT \ +OPENSSL_BN_ASM_MONT5 \ +OPENSSL_CPUID_OBJ \ +OPENSSL_IA32_SSE2 \  SHA1_ASM \  SHA256_ASM \  SHA512_ASM \ -MD5_ASM \ -DES_PTR \ -DES_RISC1 \ -DES_UNROLL \ -OPENSSL_CPUID_OBJ \ +VPAES_ASM \  "  OPENSSL_CRYPTO_INCLUDES="\ @@ -628,7 +635,6 @@ crypto/evp/m_md4.c \  crypto/evp/m_md5.c \  crypto/evp/m_mdc2.c \  crypto/evp/m_null.c \ -crypto/evp/m_ripemd.c \  crypto/evp/m_sha1.c \  crypto/evp/m_sigver.c \  crypto/evp/m_wp.c \ @@ -727,8 +733,6 @@ crypto/rc2/rc2ofb64.c \  crypto/rc4/rc4_enc.c \  crypto/rc4/rc4_skey.c \  crypto/rc4/rc4_utl.c \ -crypto/ripemd/rmd_dgst.c \ -crypto/ripemd/rmd_one.c \  crypto/rsa/rsa_ameth.c \  crypto/rsa/rsa_asn1.c \  crypto/rsa/rsa_chk.c \ @@ -825,12 +829,14 @@ crypto/x509v3/v3err.c \  OPENSSL_CRYPTO_SOURCES_arm="\  crypto/aes/asm/aes-armv4.S \ +crypto/aes/asm/aesv8-armx.S \  crypto/aes/asm/bsaes-armv7.S \  crypto/armcap.c \  crypto/armv4cpuid.S \  crypto/bn/asm/armv4-gf2m.S \  crypto/bn/asm/armv4-mont.S \  crypto/modes/asm/ghash-armv4.S \ +crypto/modes/asm/ghashv8-armx.S \  crypto/sha/asm/sha1-armv4-large.S \  crypto/sha/asm/sha256-armv4.S \  crypto/sha/asm/sha512-armv4.S \ @@ -842,6 +848,13 @@ crypto/mem_clr.c \  "  OPENSSL_CRYPTO_SOURCES_arm64="\ +crypto/armcap.c \ +crypto/arm64cpuid.S \ +crypto/aes/asm/aesv8-armx-64.S \ +crypto/modes/asm/ghashv8-armx-64.S \ +crypto/sha/asm/sha1-armv8.S \ +crypto/sha/asm/sha256-armv8.S \ +crypto/sha/asm/sha512-armv8.S \  "  OPENSSL_CRYPTO_SOURCES_EXCLUDES_arm64="\ diff --git a/main/openssl/patches/README b/main/openssl/patches/README index 2ff69282..13e9bd8b 100644 --- a/main/openssl/patches/README +++ b/main/openssl/patches/README @@ -53,6 +53,19 @@ ecdhe_psk.patch  Adds support for ECDHE Pre-Shared Key (PSK) TLS cipher suites. +ecdhe_psk_part2.patch + +Removes ECHDE-PSK cipher suites with SHA-2 because they cannot be used with +SSLv3 (and there's no way to express that in OpenSSL's configuration). Adds +SHA-1 based ECDHE-PSK AES-CBC cipher suites instead. 
+ +arm_asm.patch + +Adds newer ARM assembly pack with BSAES for ARMv7 and acceleration for ARMv8 +Based on branch available at: +https://git.linaro.org/people/ard.biesheuvel/openssl.git/shortlog/refs/heads/openssl-1.0.1f-with-arm-patches +c7b582ef23eb6f4386664e841e6e406d984c38d3^..cb8b1ab03e5c179a719afe83f03fecb1c2c78730 +  tls_psk_hint.patch  Fixes issues with TLS-PSK identity hint implementation where diff --git a/main/openssl/ssl/s3_lib.c b/main/openssl/ssl/s3_lib.c index 4eb54284..896d1e19 100644 --- a/main/openssl/ssl/s3_lib.c +++ b/main/openssl/ssl/s3_lib.c @@ -2828,35 +2828,34 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={  #ifndef OPENSSL_NO_PSK      /* ECDH PSK ciphersuites from RFC 5489 */ - -	/* Cipher C037 */ +	/* Cipher C035 */  	{  	1, -	TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256, -	TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256, +	TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA, +	TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA,  	SSL_kEECDH,  	SSL_aPSK,  	SSL_AES128, -	SSL_SHA256, +	SSL_SHA1,  	SSL_TLSV1, -	SSL_NOT_EXP|SSL_HIGH, -	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF_SHA256, +	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, +	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,  	128,  	128,  	}, -	/* Cipher C038 */ +	/* Cipher C036 */  	{  	1, -	TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384, -	TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384, +	TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA, +	TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA,  	SSL_kEECDH,  	SSL_aPSK,  	SSL_AES256, -	SSL_SHA384, +	SSL_SHA1,  	SSL_TLSV1, -	SSL_NOT_EXP|SSL_HIGH, -	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF_SHA384, +	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, +	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,  	256,  	256,  	}, diff --git a/main/openssl/ssl/ssl.h b/main/openssl/ssl/ssl.h index 06bb90f8..a85841b3 100644 --- a/main/openssl/ssl/ssl.h +++ b/main/openssl/ssl/ssl.h @@ -1816,6 +1816,7 @@ int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);  char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);  const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);  unsigned long 	SSL_CIPHER_get_id(const SSL_CIPHER *c); +const char *	SSL_CIPHER_authentication_method(const SSL_CIPHER* cipher);  int	SSL_get_fd(const SSL *s);  int	SSL_get_rfd(const SSL *s); diff --git a/main/openssl/ssl/tls1.h b/main/openssl/ssl/tls1.h index 66520893..b9a0899e 100644 --- a/main/openssl/ssl/tls1.h +++ b/main/openssl/ssl/tls1.h @@ -532,9 +532,11 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)  #define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031  #define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032 -/* ECDHE PSK ciphersuites from RFC 5489 */ -#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256    0x0300C037 -#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384    0x0300C038 +/* ECDHE PSK ciphersuites from RFC5489 + * SHA-2 cipher suites are omitted because they cannot be used safely with + * SSLv3. 
*/ +#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA          0x0300C035 +#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA          0x0300C036  /* XXX   * Inconsistency alert: @@ -687,9 +689,9 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)  #define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"  #define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384" -/* ECDHE PSK ciphersuites from RFC 5489 */ -#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256  "ECDHE-PSK-WITH-AES-128-CBC-SHA256" -#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384  "ECDHE-PSK-WITH-AES-256-CBC-SHA384" +/* ECDHE PSK ciphersuites from RFC5489 */ +#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA         "ECDHE-PSK-AES128-CBC-SHA" +#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA         "ECDHE-PSK-AES256-CBC-SHA"  #define TLS_CT_RSA_SIGN			1  #define TLS_CT_DSS_SIGN			2  | 
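For consumers of the header changes above, a minimal C usage sketch: the cipher-list string is built from the renamed TLS1_TXT_* values, and SSL_CIPHER_authentication_method() is the accessor declared in ssl.h by this change; whatever string it returns for a given suite is an implementation detail of this tree and is only printed here, not assumed.

#include <stdio.h>
#include <openssl/ssl.h>

/* Request the SHA-1 based ECDHE-PSK suites introduced above. */
static int prefer_ecdhe_psk(SSL_CTX *ctx) {
	return SSL_CTX_set_cipher_list(ctx,
	    "ECDHE-PSK-AES128-CBC-SHA:ECDHE-PSK-AES256-CBC-SHA");
}

/* Report the negotiated cipher and its authentication method via the
 * newly exported SSL_CIPHER_authentication_method() accessor. */
static void dump_cipher_info(const SSL *ssl) {
	const SSL_CIPHER *c = SSL_get_current_cipher(ssl);
	if (c != NULL)
		printf("cipher=%s auth=%s\n",
		       SSL_CIPHER_get_name(c),
		       SSL_CIPHER_authentication_method(c));
}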
