From 626c2706b1f7abdc6af1216873b7687e59025d1f Mon Sep 17 00:00:00 2001
From: Arne Schwabe <arne@rfc2549.org>
Date: Thu, 5 Jun 2014 18:34:09 +0200
Subject: Update OpenSSL to aosp/master

--HG--
extra : rebase_source : a2f70c1a7529c7fcfc88f8dd1882e66e6ba42167
---
 main/openssl/Apps-config-host.mk           |   26 +-
 main/openssl/Crypto-config-host.mk         |   37 +-
 main/openssl/Crypto-config-target.mk       |   11 +
 main/openssl/Crypto.mk                     |    4 +
 main/openssl/Ssl-config-host.mk            |   26 +-
 main/openssl/android-config.mk             |   19 +-
 main/openssl/apps/md4.c                    |  128 +-
 main/openssl/crypto/aes/asm/bsaes-armv7.S  | 2544 ++++++++++++++++++++++++++++
 main/openssl/crypto/aes/asm/bsaes-armv7.pl | 2467 +++++++++++++++++++++++++++
 main/openssl/crypto/armcap.c               |   80 +
 main/openssl/crypto/bn/asm/bn-586.S        |  441 +++--
 main/openssl/crypto/bn/asm/x86-gf2m.S      |   12 +
 main/openssl/crypto/bn/asm/x86-mont.S      |  174 +-
 main/openssl/crypto/evp/e_aes.c            |   10 +
 main/openssl/crypto/modes/asm/ghash-x86.S  | 1327 ++++++++++-----
 main/openssl/crypto/ocsp/ocsp.h            |    7 +
 main/openssl/crypto/sha/asm/sha1-586.S     | 1265 +++++++++++++-
 main/openssl/crypto/sha/asm/sha512-586.S   |  281 ++-
 main/openssl/crypto/x86cpuid.S             |   46 +-
 main/openssl/import_openssl.sh             |   48 +-
 main/openssl/include/openssl/ocsp.h        |    7 +
 main/openssl/include/openssl/ssl.h         |    4 +
 main/openssl/include/openssl/tls1.h        |    8 +
 main/openssl/openssl.config                |   17 +-
 main/openssl/patches/README                |   14 +
 main/openssl/rules.mk                      |   18 +-
 main/openssl/ssl/d1_clnt.c                 |   13 +-
 main/openssl/ssl/d1_srvr.c                 |   10 +-
 main/openssl/ssl/s3_clnt.c                 |  359 ++--
 main/openssl/ssl/s3_lib.c                  |   38 +-
 main/openssl/ssl/s3_srvr.c                 |  540 +++---
 main/openssl/ssl/ssl.h                     |    4 +
 main/openssl/ssl/ssl_lib.c                 |   58 +-
 main/openssl/ssl/ssl_sess.c                |   12 +
 main/openssl/ssl/tls1.h                    |    8 +
 35 files changed, 8838 insertions(+), 1225 deletions(-)
 mode change 100644 => 120000 main/openssl/apps/md4.c
 create mode 100644 main/openssl/crypto/aes/asm/bsaes-armv7.S
 create mode 100644 main/openssl/crypto/aes/asm/bsaes-armv7.pl
 create mode 100644 main/openssl/crypto/armcap.c

diff --git a/main/openssl/Apps-config-host.mk b/main/openssl/Apps-config-host.mk
index c1e5c6c5..37dcb78b 100644
--- a/main/openssl/Apps-config-host.mk
+++ b/main/openssl/Apps-config-host.mk
@@ -105,21 +105,15 @@ mips_src_files :=
 mips_exclude_files :=
 
 
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
-ifneq ($(BUILD_HOST_64bit),)
-host_arch := x86_64
-else
-host_arch := x86
-endif
-else
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86_64)
-host_arch := x86_64
+LOCAL_CFLAGS += $(common_cflags)
+LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
+
+ifeq ($(HOST_OS),linux)
+LOCAL_CFLAGS_x86 += $(x86_cflags)
+LOCAL_SRC_FILES_x86 += $(filter-out $(x86_exclude_files), $(common_src_files) $(x86_src_files))
+LOCAL_CFLAGS_x86_64 += $(x86_64_cflags)
+LOCAL_SRC_FILES_x86_64 += $(filter-out $(x86_64_exclude_files), $(common_src_files) $(x86_64_src_files))
 else
-$(warning Unknown host architecture $(HOST_OS)-$(HOST_ARCH))
-host_arch := unknown
-endif
+$(warning Unknown host OS $(HOST_OS))
+LOCAL_SRC_FILES += $(common_src_files)
 endif
-
-LOCAL_CFLAGS     += $(common_cflags) $($(host_arch)_cflags)
-LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
-LOCAL_SRC_FILES  += $(filter-out $($(host_arch)_exclude_files), $(common_src_files) $($(host_arch)_src_files))
diff --git a/main/openssl/Crypto-config-host.mk b/main/openssl/Crypto-config-host.mk
index 35013240..a377fec4 100644
--- a/main/openssl/Crypto-config-host.mk
+++ b/main/openssl/Crypto-config-host.mk
@@ -545,15 +545,20 @@ common_c_includes := \
 
 arm_cflags := \
   -DAES_ASM \
+  -DBSAES_ASM \
   -DGHASH_ASM \
   -DOPENSSL_BN_ASM_GF2m \
   -DOPENSSL_BN_ASM_MONT \
+  -DOPENSSL_CPUID_OBJ \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
 
 arm_src_files := \
   crypto/aes/asm/aes-armv4.S \
+  crypto/aes/asm/bsaes-armv7.S \
+  crypto/armcap.c \
+  crypto/armv4cpuid.S \
   crypto/bn/asm/armv4-gf2m.S \
   crypto/bn/asm/armv4-mont.S \
   crypto/modes/asm/ghash-armv4.S \
@@ -563,6 +568,7 @@ arm_src_files := \
 
 arm_exclude_files := \
   crypto/aes/aes_core.c \
+  crypto/mem_clr.c \
 
 arm64_cflags := \
   -DOPENSSL_NO_ASM \
@@ -582,9 +588,11 @@ x86_cflags := \
   -DOPENSSL_BN_ASM_MONT \
   -DOPENSSL_BN_ASM_PART_WORDS \
   -DOPENSSL_CPUID_OBJ \
+  -DOPENSSL_IA32_SSE2 \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
+  -DVPAES_ASM \
 
 x86_src_files := \
   crypto/aes/asm/aes-586.S \
@@ -615,6 +623,7 @@ x86_exclude_files := \
 
 x86_64_cflags := \
   -DAES_ASM \
+  -DBSAES_ASM \
   -DDES_PTR \
   -DDES_RISC1 \
   -DDES_UNROLL \
@@ -622,10 +631,12 @@ x86_64_cflags := \
   -DMD5_ASM \
   -DOPENSSL_BN_ASM_GF2m \
   -DOPENSSL_BN_ASM_MONT \
+  -DOPENSSL_BN_ASM_MONT5 \
   -DOPENSSL_CPUID_OBJ \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
+  -DVPAES_ASM \
 
 x86_64_src_files := \
   crypto/aes/asm/aes-x86_64.S \
@@ -673,21 +684,15 @@ mips_exclude_files := \
   crypto/bn/bn_asm.c \
 
 
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
-ifneq ($(BUILD_HOST_64bit),)
-host_arch := x86_64
-else
-host_arch := x86
-endif
-else
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86_64)
-host_arch := x86_64
+LOCAL_CFLAGS += $(common_cflags)
+LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
+
+ifeq ($(HOST_OS),linux)
+LOCAL_CFLAGS_x86 += $(x86_cflags)
+LOCAL_SRC_FILES_x86 += $(filter-out $(x86_exclude_files), $(common_src_files) $(x86_src_files))
+LOCAL_CFLAGS_x86_64 += $(x86_64_cflags)
+LOCAL_SRC_FILES_x86_64 += $(filter-out $(x86_64_exclude_files), $(common_src_files) $(x86_64_src_files))
 else
-$(warning Unknown host architecture $(HOST_OS)-$(HOST_ARCH))
-host_arch := unknown
-endif
+$(warning Unknown host OS $(HOST_OS))
+LOCAL_SRC_FILES += $(common_src_files)
 endif
-
-LOCAL_CFLAGS     += $(common_cflags) $($(host_arch)_cflags)
-LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
-LOCAL_SRC_FILES  += $(filter-out $($(host_arch)_exclude_files), $(common_src_files) $($(host_arch)_src_files))
diff --git a/main/openssl/Crypto-config-target.mk b/main/openssl/Crypto-config-target.mk
index 6dbf1bd3..2c5b01e5 100644
--- a/main/openssl/Crypto-config-target.mk
+++ b/main/openssl/Crypto-config-target.mk
@@ -545,15 +545,20 @@ common_c_includes := \
 
 arm_cflags := \
   -DAES_ASM \
+  -DBSAES_ASM \
   -DGHASH_ASM \
   -DOPENSSL_BN_ASM_GF2m \
   -DOPENSSL_BN_ASM_MONT \
+  -DOPENSSL_CPUID_OBJ \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
 
 arm_src_files := \
   crypto/aes/asm/aes-armv4.S \
+  crypto/aes/asm/bsaes-armv7.S \
+  crypto/armcap.c \
+  crypto/armv4cpuid.S \
   crypto/bn/asm/armv4-gf2m.S \
   crypto/bn/asm/armv4-mont.S \
   crypto/modes/asm/ghash-armv4.S \
@@ -563,6 +568,7 @@ arm_src_files := \
 
 arm_exclude_files := \
   crypto/aes/aes_core.c \
+  crypto/mem_clr.c \
 
 arm64_cflags := \
   -DOPENSSL_NO_ASM \
@@ -582,9 +588,11 @@ x86_cflags := \
   -DOPENSSL_BN_ASM_MONT \
   -DOPENSSL_BN_ASM_PART_WORDS \
   -DOPENSSL_CPUID_OBJ \
+  -DOPENSSL_IA32_SSE2 \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
+  -DVPAES_ASM \
 
 x86_src_files := \
   crypto/aes/asm/aes-586.S \
@@ -615,6 +623,7 @@ x86_exclude_files := \
 
 x86_64_cflags := \
   -DAES_ASM \
+  -DBSAES_ASM \
   -DDES_PTR \
   -DDES_RISC1 \
   -DDES_UNROLL \
@@ -622,10 +631,12 @@ x86_64_cflags := \
   -DMD5_ASM \
   -DOPENSSL_BN_ASM_GF2m \
   -DOPENSSL_BN_ASM_MONT \
+  -DOPENSSL_BN_ASM_MONT5 \
   -DOPENSSL_CPUID_OBJ \
   -DSHA1_ASM \
   -DSHA256_ASM \
   -DSHA512_ASM \
+  -DVPAES_ASM \
 
 x86_64_src_files := \
   crypto/aes/asm/aes-x86_64.S \
diff --git a/main/openssl/Crypto.mk b/main/openssl/Crypto.mk
index 9c558d9e..4214b91e 100644
--- a/main/openssl/Crypto.mk
+++ b/main/openssl/Crypto.mk
@@ -31,6 +31,10 @@ LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
 # in the NDK.
 ifeq (,$(TARGET_BUILD_APPS))
 LOCAL_CLANG := true
+ifeq ($(HOST_OS), darwin_XXX)
+LOCAL_ASFLAGS += -no-integrated-as
+LOCAL_CFLAGS += -no-integrated-as
+endif
 else
 LOCAL_SDK_VERSION := 9
 endif
diff --git a/main/openssl/Ssl-config-host.mk b/main/openssl/Ssl-config-host.mk
index 203544df..95035487 100644
--- a/main/openssl/Ssl-config-host.mk
+++ b/main/openssl/Ssl-config-host.mk
@@ -99,21 +99,15 @@ mips_src_files :=
 mips_exclude_files :=
 
 
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
-ifneq ($(BUILD_HOST_64bit),)
-host_arch := x86_64
-else
-host_arch := x86
-endif
-else
-ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86_64)
-host_arch := x86_64
+LOCAL_CFLAGS += $(common_cflags)
+LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
+
+ifeq ($(HOST_OS),linux)
+LOCAL_CFLAGS_x86 += $(x86_cflags)
+LOCAL_SRC_FILES_x86 += $(filter-out $(x86_exclude_files), $(common_src_files) $(x86_src_files))
+LOCAL_CFLAGS_x86_64 += $(x86_64_cflags)
+LOCAL_SRC_FILES_x86_64 += $(filter-out $(x86_64_exclude_files), $(common_src_files) $(x86_64_src_files))
 else
-$(warning Unknown host architecture $(HOST_OS)-$(HOST_ARCH))
-host_arch := unknown
-endif
+$(warning Unknown host OS $(HOST_OS))
+LOCAL_SRC_FILES += $(common_src_files)
 endif
-
-LOCAL_CFLAGS     += $(common_cflags) $($(host_arch)_cflags)
-LOCAL_C_INCLUDES += $(common_c_includes) $(local_c_includes)
-LOCAL_SRC_FILES  += $(filter-out $($(host_arch)_exclude_files), $(common_src_files) $($(host_arch)_src_files))
diff --git a/main/openssl/android-config.mk b/main/openssl/android-config.mk
index 84ab6782..2a091130 100644
--- a/main/openssl/android-config.mk
+++ b/main/openssl/android-config.mk
@@ -7,6 +7,20 @@
 # This script performs minor but required patching for the Android build.
 #
 
+# Directories for ENGINE shared libraries
+openssl_cflags_32 += \
+  -DOPENSSLDIR="\"/system/lib/ssl\"" \
+  -DENGINESDIR="\"/system/lib/ssl/engines\""
+openssl_cflags_static_32 += \
+  -DOPENSSLDIR="\"/system/lib/ssl\"" \
+  -DENGINESDIR="\"/system/lib/ssl/engines\""
+openssl_cflags_64 += \
+  -DOPENSSLDIR="\"/system/lib64/ssl\"" \
+  -DENGINESDIR="\"/system/lib64/ssl/engines\""
+openssl_cflags_static_64 += \
+  -DOPENSSLDIR="\"/system/lib64/ssl\"" \
+  -DENGINESDIR="\"/system/lib64/ssl/engines\""
+
 # Intentionally excluded http://b/7079965
 ifneq (,$(filter -DZLIB, $(openssl_cflags_32) $(openssl_cflags_64) \
     $(openssl_cflags_static_32) $(openssl_cflags_static_64)))
@@ -27,11 +41,6 @@ LOCAL_CFLAGS_32 := $(filter-out -DDSO_DLFCN -DHAVE_DLFCN_H,$(LOCAL_CFLAGS_32))
 LOCAL_CFLAGS_64 := $(filter-out -DDSO_DLFCN -DHAVE_DLFCN_H,$(LOCAL_CFLAGS_64))
 endif
 
-# Directories
-LOCAL_CFLAGS += \
-  -DOPENSSLDIR="\"/system/lib/ssl\"" \
-  -DENGINESDIR="\"/system/lib/ssl/engines\""
-
 # Debug
 # LOCAL_CFLAGS += -DCIPHER_DEBUG
 
diff --git a/main/openssl/apps/md4.c b/main/openssl/apps/md4.c
deleted file mode 100644
index 141415ad..00000000
--- a/main/openssl/apps/md4.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/* crypto/md4/md4.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- * 
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to.  The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- * 
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *    "This product includes cryptographic software written by
- *     Eric Young (eay@cryptsoft.com)"
- *    The word 'cryptographic' can be left out if the rouines from the library
- *    being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from 
- *    the apps directory (application code) you must include an acknowledgement:
- *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- * 
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * 
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed.  i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <openssl/md4.h>
-
-#define BUFSIZE	1024*16
-
-void do_fp(FILE *f);
-void pt(unsigned char *md);
-#if !defined(_OSD_POSIX) && !defined(__DJGPP__)
-int read(int, void *, unsigned int);
-#endif
-
-int main(int argc, char **argv)
-	{
-	int i,err=0;
-	FILE *IN;
-
-	if (argc == 1)
-		{
-		do_fp(stdin);
-		}
-	else
-		{
-		for (i=1; i<argc; i++)
-			{
-			IN=fopen(argv[i],"r");
-			if (IN == NULL)
-				{
-				perror(argv[i]);
-				err++;
-				continue;
-				}
-			printf("MD4(%s)= ",argv[i]);
-			do_fp(IN);
-			fclose(IN);
-			}
-		}
-	exit(err);
-	}
-
-void do_fp(FILE *f)
-	{
-	MD4_CTX c;
-	unsigned char md[MD4_DIGEST_LENGTH];
-	int fd;
-	int i;
-	static unsigned char buf[BUFSIZE];
-
-	fd=fileno(f);
-	MD4_Init(&c);
-	for (;;)
-		{
-		i=read(fd,buf,sizeof buf);
-		if (i <= 0) break;
-		MD4_Update(&c,buf,(unsigned long)i);
-		}
-	MD4_Final(&(md[0]),&c);
-	pt(md);
-	}
-
-void pt(unsigned char *md)
-	{
-	int i;
-
-	for (i=0; i<MD4_DIGEST_LENGTH; i++)
-		printf("%02x",md[i]);
-	printf("\n");
-	}
-
diff --git a/main/openssl/apps/md4.c b/main/openssl/apps/md4.c
new file mode 120000
index 00000000..7f457b2a
--- /dev/null
+++ b/main/openssl/apps/md4.c
@@ -0,0 +1 @@
+../crypto/md4/md4.c
\ No newline at end of file
diff --git a/main/openssl/crypto/aes/asm/bsaes-armv7.S b/main/openssl/crypto/aes/asm/bsaes-armv7.S
new file mode 100644
index 00000000..64205d45
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-armv7.S
@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@					<ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu	neon
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	r6,_bsaes_decrypt8
+	vldmia	r4!, {q9}		@ round 0 key
+	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+	vldmia	r6!, {q8}		@ .LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	 vtbl.8	d0, {q10}, d16
+	 vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	 vtbl.8	d2, {q11}, d16
+	 vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	 vtbl.8	d4, {q12}, d16
+	 vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	 vtbl.8	d6, {q13}, d16
+	 vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	 vtbl.8	d8, {q14}, d16
+	 vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	 vtbl.8	d10, {q15}, d16
+	 vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	 vtbl.8	d12, {q10}, d16
+	 vtbl.8	d13, {q10}, d17
+	 vtbl.8	d14, {q11}, d16
+	 vtbl.8	d15, {q11}, d17
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q4, #1
+	veor		q10, q10, q7
+	 veor		q11, q11, q5
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #1
+	 veor		q5, q5, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q3
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	 vshr.u64	q11, q4, #2
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q5, q5, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q3
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q3, #4
+	 vshr.u64	q11, q2, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #4
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q4
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	sub	r5,r5,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+	vldmia	r4!, {q8-q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Ldec_sbox:
+	 veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	 veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	 veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	 vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor 	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q2
+	veor	q12, q12, q8
+	 veor	q2, q2, q6
+	vand	q8, q8, q15
+	 vand	q6, q6, q13
+	vand	q12, q12, q14
+	 vand	q2, q2, q9
+	veor	q8, q8, q12
+	 veor	q2, q2, q6
+	veor	q12, q12, q11
+	 veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q0
+	veor	q12, q12, q8
+	 veor	q0, q0, q4
+	vand	q8, q8, q15
+	 vand	q4, q4, q13
+	vand	q12, q12, q14
+	 vand	q0, q0, q9
+	veor	q8, q8, q12
+	 veor	q0, q0, q4
+	veor	q12, q12, q11
+	 veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor 	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	 veor 	q7, q7, q0
+	veor	q4, q4, q5
+	 veor	q3, q3, q6
+	 veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	.Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	 veor	q0, q0, q14
+	 veor	q1, q1, q14
+	 veor	q6, q6, q8
+	 veor	q2, q2, q10
+	 veor	q4, q4, q9
+	 veor	q1, q1, q15
+	 veor	q6, q6, q15
+	 veor	q2, q2, q14
+	 veor	q7, q7, q11
+	 veor	q4, q4, q14
+	 veor	q3, q3, q12
+	 veor	q2, q2, q15
+	 veor	q7, q7, q15
+	 veor	q5, q5, q13
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	 veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	 veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	 veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	 veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	 veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	 veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	 veor	q5, q5, q15
+	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	 vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	 veor	q0, q0, q8
+	veor	q14, q14, q7
+	 veor	q1, q1, q9
+	 vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	 vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	 vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	 vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	 vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	 vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Ldec_loop
+	vldmia	r6, {q12}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q3, #1
+	 vshr.u64	q11, q2, #1
+	veor		q10, q10, q5
+	 veor		q11, q11, q7
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #1
+	 veor		q7, q7, q11
+	 vshl.u64	q11, q11, #1
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q4
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q4, q4, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	 vshr.u64	q11, q2, #2
+	veor		q10, q10, q5
+	 veor		q11, q11, q3
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #2
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #2
+	veor		q7, q7, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q4
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q4, q4, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q4, #4
+	 vshr.u64	q11, q6, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q3
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #4
+	veor		q4, q4, q10
+	 veor		q6, q6, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q2
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:	@ InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:		@ ShiftRows constants
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+	.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	r6,_bsaes_encrypt8
+	vldmia	r4!, {q9}		@ round 0 key
+	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
+
+	vldmia	r6!, {q8}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	 vtbl.8	d0, {q10}, d16
+	 vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	 vtbl.8	d2, {q11}, d16
+	 vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	 vtbl.8	d4, {q12}, d16
+	 vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	 vtbl.8	d6, {q13}, d16
+	 vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	 vtbl.8	d8, {q14}, d16
+	 vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	 vtbl.8	d10, {q15}, d16
+	 vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	 vtbl.8	d12, {q10}, d16
+	 vtbl.8	d13, {q10}, d17
+	 vtbl.8	d14, {q11}, d16
+	 vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q4, #1
+	veor		q10, q10, q7
+	 veor		q11, q11, q5
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #1
+	 veor		q5, q5, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q3
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	 vshr.u64	q11, q4, #2
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q5, q5, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q3
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q3, #4
+	 vshr.u64	q11, q2, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #4
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q4
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	sub	r5,r5,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+	vldmia	r4!, {q8-q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	 vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor 	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q0
+	veor	q12, q12, q8
+	 veor	q0, q0, q3
+	vand	q8, q8, q15
+	 vand	q3, q3, q13
+	vand	q12, q12, q14
+	 vand	q0, q0, q9
+	veor	q8, q8, q12
+	 veor	q0, q0, q3
+	veor	q12, q12, q11
+	 veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q4
+	veor	q12, q12, q8
+	 veor	q4, q4, q2
+	vand	q8, q8, q15
+	 vand	q2, q2, q13
+	vand	q12, q12, q14
+	 vand	q4, q4, q9
+	veor	q8, q8, q12
+	 veor	q4, q4, q2
+	veor	q12, q12, q11
+	 veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor 	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	.Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	 veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	 veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	 veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	 veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	 veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	 veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	 veor	q5, q5, q15
+	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	 vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	 veor	q0, q0, q8
+	veor	q14, q14, q7
+	 veor	q1, q1, q9
+	 vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	 vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	 vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	 vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	 vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	 vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LSR
+	ite	eq				@ Thumb2 thing, samity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Lenc_loop
+	vldmia	r6, {q12}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q3, #1
+	veor		q10, q10, q5
+	 veor		q11, q11, q7
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #1
+	 veor		q7, q7, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q3, q3, q11
+	vshr.u64	q10, q4, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q6
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q6, q6, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q4, q4, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	 vshr.u64	q11, q3, #2
+	veor		q10, q10, q5
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q7, q7, q10
+	 veor		q3, q3, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q6
+	 veor		q11, q11, q4
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q6, q6, q10
+	vshl.u64	q10, q10, #2
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q6, #4
+	 vshr.u64	q11, q4, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q2
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #4
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q3
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	r6,_bsaes_key_convert
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+	sub	r6,r6,#_bsaes_key_convert-.LM0
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0-q7}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+#ifndef	__KERNEL__
+	cmp	r2, #128
+#ifndef	__thumb__
+	blo	AES_cbc_encrypt
+#else
+	bhs	1f
+	b	AES_cbc_encrypt
+1:
+#endif
+#endif
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ sifze of bit-slices key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+0:
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{q0-q1}, [r0]!	@ load input
+	vld1.8	{q2-q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4-q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6-q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12-q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14-q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	.Lcbc_dec_done
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+	beq	.Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	.Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	.Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12-q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	mov	r10, r1			@ save original out pointer
+	mov	r1, r9			@ use the iv scratch space as out buffer
+	mov	r2, r3
+	vmov	q4,q15		@ just in case ensure that IV
+	vmov	q5,q0			@ and input are preserved
+	bl	AES_decrypt
+	vld1.8	{q0}, [r9,:64]		@ load result
+	veor	q0, q0, q4	@ ^= IV
+	vmov	q15, q5		@ q5 holds input
+	vst1.8	{q0}, [r10]		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r9
+	bne		.Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.extern	AES_encrypt
+.global	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	cmp	r2, #8			@ use plain AES for
+	blo	.Lctr_enc_short			@ small sizes
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+0:	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, .LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor		q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8		q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia		sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x10		@ pass next round key
+#else
+	add		r4, r3, #264
+#endif
+	vldmia		r8, {q8}			@ .LREVM0SR
+	mov		r5, r10			@ pass rounds
+	vstmia		r9, {q10}			@ save next counter
+	sub		r6, r8, #.LREVM0SR-.LSR	@ pass constants
+
+	bl		_bsaes_encrypt8_alt
+
+	subs		r2, r2, #8
+	blo		.Lctr_enc_loop_done
+
+	vld1.8		{q8-q9}, [r0]!	@ load input
+	vld1.8		{q10-q11}, [r0]!
+	veor		q0, q8
+	veor		q1, q9
+	vld1.8		{q12-q13}, [r0]!
+	veor		q4, q10
+	veor		q6, q11
+	vld1.8		{q14-q15}, [r0]!
+	veor		q3, q12
+	vst1.8		{q0-q1}, [r1]!	@ write output
+	veor		q7, q13
+	veor		q2, q14
+	vst1.8		{q4}, [r1]!
+	veor		q5, q15
+	vst1.8		{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8		{q3}, [r1]!
+	veor		q9, q9, q9
+	vst1.8		{q7}, [r1]!
+	vext.8		q8, q9, q8, #4
+	vst1.8		{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8		{q5}, [r1]!
+	vldmia		r9, {q0}			@ load counter
+
+	bne		.Lctr_enc_loop
+	b		.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add		r2, r2, #8
+	vld1.8		{q8}, [r0]!	@ load input
+	veor		q0, q8
+	vst1.8		{q0}, [r1]!	@ write output
+	cmp		r2, #2
+	blo		.Lctr_enc_done
+	vld1.8		{q9}, [r0]!
+	veor		q1, q9
+	vst1.8		{q1}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q10}, [r0]!
+	veor		q4, q10
+	vst1.8		{q4}, [r1]!
+	cmp		r2, #4
+	blo		.Lctr_enc_done
+	vld1.8		{q11}, [r0]!
+	veor		q6, q11
+	vst1.8		{q6}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q12}, [r0]!
+	veor		q3, q12
+	vst1.8		{q3}, [r1]!
+	cmp		r2, #6
+	blo		.Lctr_enc_done
+	vld1.8		{q13}, [r0]!
+	veor		q7, q13
+	vst1.8		{q7}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q14}, [r0]
+	veor		q2, q14
+	vst1.8		{q2}, [r1]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:			@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r9
+	bne		.Lctr_enc_bzero
+#else
+	vstmia		sp, {q0-q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}	@ return
+
+.align	4
+.Lctr_enc_short:
+	ldr	ip, [sp]		@ ctr pointer is passed on stack
+	stmdb	sp!, {r4-r8, lr}
+
+	mov	r4, r0		@ copy arguments
+	mov	r5, r1
+	mov	r6, r2
+	mov	r7, r3
+	ldr	r8, [ip, #12]		@ load counter LSW
+	vld1.8	{q1}, [ip]		@ load whole counter value
+#ifdef __ARMEL__
+	rev	r8, r8
+#endif
+	sub	sp, sp, #0x10
+	vst1.8	{q1}, [sp,:64]	@ copy counter value
+	sub	sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+	add	r0, sp, #0x10		@ input counter value
+	mov	r1, sp			@ output on the stack
+	mov	r2, r7			@ key
+
+	bl	AES_encrypt
+
+	vld1.8	{q0}, [r4]!	@ load input
+	vld1.8	{q1}, [sp,:64]	@ load encrypted counter
+	add	r8, r8, #1
+#ifdef __ARMEL__
+	rev	r0, r8
+	str	r0, [sp, #0x1c]		@ next counter value
+#else
+	str	r8, [sp, #0x1c]		@ next counter value
+#endif
+	veor	q0,q0,q1
+	vst1.8	{q0}, [r5]!	@ store output
+	subs	r6, r6, #1
+	bne	.Lctr_enc_short_loop
+
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+	vstmia		sp!, {q0-q1}
+
+	ldmia	sp!, {r4-r8, pc}
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,%function
+.align	4
+bsaes_xts_encrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0,sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, .Lxts_magic
+
+	subs	r9, #0x80
+	blo	.Lxts_enc_short
+	b	.Lxts_enc_loop
+
+.align	4
+.Lxts_enc_loop:
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov		r0, sp
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q9, #63
+	veor		q9, q9, q6
+	vand		q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q10, #63
+	veor		q10, q10, q7
+	vand		q6, q6, q5
+	vld1.8		{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q11, #63
+	veor		q11, q11, q6
+	vand		q7, q7, q5
+	vld1.8		{q1}, [r7]!
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q12, #63
+	veor		q12, q12, q7
+	vand		q6, q6, q5
+	vld1.8		{q2}, [r7]!
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q13, #63
+	veor		q13, q13, q6
+	vand		q7, q7, q5
+	vld1.8		{q3}, [r7]!
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q14, #63
+	veor		q14, q14, q7
+	vand		q6, q6, q5
+	vld1.8		{q4}, [r7]!
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q15, #63
+	veor		q15, q15, q6
+	vand		q7, q7, q5
+	vld1.8		{q5}, [r7]!
+	veor		q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64		{q15}, [r0,:128]!
+	vswp		d15,d14
+	veor		q8, q8, q7
+	vst1.64		{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6-q7}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	veor		q7, q7, q15
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vld1.64		{q14-q15}, [r0,:128]!
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q2, q14
+	vst1.8		{q10-q11}, [r8]!
+	veor		q13, q5, q15
+	vst1.8		{q12-q13}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+
+	subs		r9, #0x80
+	bpl		.Lxts_enc_loop
+
+.Lxts_enc_short:
+	adds		r9, #0x70
+	bmi		.Lxts_enc_done
+
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov		r0, sp
+	vand		q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q9, #63
+	veor		q9, q9, q7
+	vand		q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q10, #63
+	veor		q10, q10, q6
+	vand		q7, q7, q5
+	vld1.8		{q0}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_1
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q11, #63
+	veor		q11, q11, q7
+	vand		q6, q6, q5
+	vld1.8		{q1}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_2
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q12, #63
+	veor		q12, q12, q6
+	vand		q7, q7, q5
+	vld1.8		{q2}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_3
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q13, #63
+	veor		q13, q13, q7
+	vand		q6, q6, q5
+	vld1.8		{q3}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_4
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q14, #63
+	veor		q14, q14, q6
+	vand		q7, q7, q5
+	vld1.8		{q4}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_5
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q15, #63
+	veor		q15, q15, q7
+	vand		q6, q6, q5
+	vld1.8		{q5}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_6
+	veor		q4, q4, q12
+	sub		r9, #0x10
+	vst1.64		{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vld1.64		{q14}, [r0,:128]!
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q2, q14
+	vst1.8		{q10-q11}, [r8]!
+	vst1.8		{q12}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_6:
+	vst1.64		{q14}, [r0,:128]		@ next round tweak
+
+	veor		q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q5, q5, q13
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	vst1.8		{q10-q11}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align	5
+.Lxts_magic:
+	.quad	1, 0x87
+
+.align	5
+.Lxts_enc_5:
+	vst1.64		{q13}, [r0,:128]		@ next round tweak
+
+	veor		q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q4, q4, q12
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	vst1.8		{q10}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_4:
+	vst1.64		{q12}, [r0,:128]		@ next round tweak
+
+	veor		q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q3, q3, q11
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vst1.8		{q8-q9}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_3:
+	vst1.64		{q11}, [r0,:128]		@ next round tweak
+
+	veor		q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q2, q2, q10
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	vst1.8		{q8}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_2:
+	vst1.64		{q10}, [r0,:128]		@ next round tweak
+
+	veor		q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q1, q1, q9
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	vst1.8		{q0-q1}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_1:
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3				@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r8]!
+	mov		r3, r4
+
+	vmov		q8, q9		@ next round tweak
+
+.Lxts_enc_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		r9, #0x10
+	beq		.Lxts_enc_ret
+	sub		r6, r8, #0x10
+
+.Lxts_enc_steal:
+	ldrb		r0, [r7], #1
+	ldrb		r1, [r8, #-0x10]
+	strb		r0, [r8, #-0x10]
+	strb		r1, [r8], #1
+
+	subs		r9, #1
+	bhi		.Lxts_enc_steal
+
+	vld1.8		{q0}, [r6]
+	mov		r0, sp
+	veor		q0, q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3			@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r6]
+	mov		r3, r4
+#endif
+
+.Lxts_enc_ret:
+	bic		r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_enc_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_enc_bzero
+
+	mov		sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,%function
+.align	4
+bsaes_xts_decrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0, sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, sp, #0x90
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r10, #248
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, .Lxts_magic
+
+	tst	r9, #0xf			@ if not multiple of 16
+	it	ne				@ Thumb2 thing, sanity check in ARM
+	subne	r9, #0x10			@ subtract another 16 bytes
+	subs	r9, #0x80
+
+	blo	.Lxts_dec_short
+	b	.Lxts_dec_loop
+
+.align	4
+.Lxts_dec_loop:
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov		r0, sp
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q9, #63
+	veor		q9, q9, q6
+	vand		q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q10, #63
+	veor		q10, q10, q7
+	vand		q6, q6, q5
+	vld1.8		{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q11, #63
+	veor		q11, q11, q6
+	vand		q7, q7, q5
+	vld1.8		{q1}, [r7]!
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q12, #63
+	veor		q12, q12, q7
+	vand		q6, q6, q5
+	vld1.8		{q2}, [r7]!
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q13, #63
+	veor		q13, q13, q6
+	vand		q7, q7, q5
+	vld1.8		{q3}, [r7]!
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q14, #63
+	veor		q14, q14, q7
+	vand		q6, q6, q5
+	vld1.8		{q4}, [r7]!
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q15, #63
+	veor		q15, q15, q6
+	vand		q7, q7, q5
+	vld1.8		{q5}, [r7]!
+	veor		q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64		{q15}, [r0,:128]!
+	vswp		d15,d14
+	veor		q8, q8, q7
+	vst1.64		{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6-q7}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	veor		q7, q7, q15
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vld1.64		{q14-q15}, [r0,:128]!
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q3, q14
+	vst1.8		{q10-q11}, [r8]!
+	veor		q13, q5, q15
+	vst1.8		{q12-q13}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+
+	subs		r9, #0x80
+	bpl		.Lxts_dec_loop
+
+.Lxts_dec_short:
+	adds		r9, #0x70
+	bmi		.Lxts_dec_done
+
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov		r0, sp
+	vand		q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q9, #63
+	veor		q9, q9, q7
+	vand		q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q10, #63
+	veor		q10, q10, q6
+	vand		q7, q7, q5
+	vld1.8		{q0}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_1
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q11, #63
+	veor		q11, q11, q7
+	vand		q6, q6, q5
+	vld1.8		{q1}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_2
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q12, #63
+	veor		q12, q12, q6
+	vand		q7, q7, q5
+	vld1.8		{q2}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_3
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q13, #63
+	veor		q13, q13, q7
+	vand		q6, q6, q5
+	vld1.8		{q3}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_4
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q14, #63
+	veor		q14, q14, q6
+	vand		q7, q7, q5
+	vld1.8		{q4}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_5
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q15, #63
+	veor		q15, q15, q7
+	vand		q6, q6, q5
+	vld1.8		{q5}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_6
+	veor		q4, q4, q12
+	sub		r9, #0x10
+	vst1.64		{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vld1.64		{q14}, [r0,:128]!
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q3, q14
+	vst1.8		{q10-q11}, [r8]!
+	vst1.8		{q12}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_6:
+	vst1.64		{q14}, [r0,:128]		@ next round tweak
+
+	veor		q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q5, q5, q13
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	vst1.8		{q10-q11}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_5:
+	vst1.64		{q13}, [r0,:128]		@ next round tweak
+
+	veor		q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q4, q4, q12
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	vst1.8		{q10}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_4:
+	vst1.64		{q12}, [r0,:128]		@ next round tweak
+
+	veor		q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q3, q3, q11
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vst1.8		{q8-q9}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_3:
+	vst1.64		{q11}, [r0,:128]		@ next round tweak
+
+	veor		q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q2, q2, q10
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	vst1.8		{q8}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_2:
+	vst1.64		{q10}, [r0,:128]		@ next round tweak
+
+	veor		q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q1, q1, q9
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	vst1.8		{q0-q1}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_1:
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3				@ preserve fp
+	mov		r5, r2			@ preserve magic
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r8]!
+	mov		r3, r4
+	mov		r2, r5
+
+	vmov		q8, q9		@ next round tweak
+
+.Lxts_dec_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		r9, #0x10
+	beq		.Lxts_dec_ret
+
+	@ calculate one round of extra tweak for the stolen ciphertext
+	vldmia		r2, {q5}
+	vshr.s64	q6, q8, #63
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vswp		d13,d12
+	veor		q9, q9, q6
+
+	@ perform the final decryption with the last tweak value
+	vld1.8		{q0}, [r7]!
+	mov		r0, sp
+	veor		q0, q0, q9
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3			@ preserve fp
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q9
+	vst1.8		{q0}, [r8]
+
+	mov		r6, r8
+.Lxts_dec_steal:
+	ldrb		r1, [r8]
+	ldrb		r0, [r7], #1
+	strb		r1, [r8, #0x10]
+	strb		r0, [r8], #1
+
+	subs		r9, #1
+	bhi		.Lxts_dec_steal
+
+	vld1.8		{q0}, [r6]
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r6]
+	mov		r3, r4
+#endif
+
+.Lxts_dec_ret:
+	bic		r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_dec_bzero
+
+	mov		sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
diff --git a/main/openssl/crypto/aes/asm/bsaes-armv7.pl b/main/openssl/crypto/aes/asm/bsaes-armv7.pl
new file mode 100644
index 00000000..f3d96d93
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt	19.5 cycles per byte processed with 128-bit key
+# decrypt	22.1 cycles per byte processed with 128-bit key
+# key conv.	440  cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+#						<appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+#					<ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InBasisChange	(@b);
+	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
+	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+	veor	@b[2], @b[2], @b[1]
+	veor	@b[5], @b[5], @b[6]
+	veor	@b[3], @b[3], @b[0]
+	veor	@b[6], @b[6], @b[2]
+	veor	@b[5], @b[5], @b[0]
+
+	veor	@b[6], @b[6], @b[3]
+	veor	@b[3], @b[3], @b[7]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[3], @b[3], @b[4]
+	veor	@b[4], @b[4], @b[5]
+
+	veor	@b[2], @b[2], @b[7]
+	veor	@b[3], @b[3], @b[1]
+	veor	@b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+	veor	@b[0], @b[0], @b[6]
+	veor	@b[1], @b[1], @b[4]
+	veor	@b[4], @b[4], @b[6]
+	veor	@b[2], @b[2], @b[0]
+	veor	@b[6], @b[6], @b[1]
+
+	veor	@b[1], @b[1], @b[5]
+	veor	@b[5], @b[5], @b[3]
+	veor	@b[3], @b[3], @b[7]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[2], @b[2], @b[5]
+
+	veor	@b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InvInBasisChange	(@b);
+	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
+	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+	 veor	@b[1], @b[1], @b[7]
+	veor	@b[4], @b[4], @b[7]
+
+	veor	@b[7], @b[7], @b[5]
+	 veor	@b[1], @b[1], @b[3]
+	veor	@b[2], @b[2], @b[5]
+	veor	@b[3], @b[3], @b[7]
+
+	veor	@b[6], @b[6], @b[1]
+	veor	@b[2], @b[2], @b[0]
+	 veor	@b[5], @b[5], @b[3]
+	veor	@b[4], @b[4], @b[6]
+	veor	@b[0], @b[0], @b[6]
+	veor	@b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange {		# InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+	veor	@b[1], @b[1], @b[5]
+	veor	@b[2], @b[2], @b[7]
+
+	veor	@b[3], @b[3], @b[1]
+	veor	@b[4], @b[4], @b[5]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[3], @b[3], @b[4]
+	 veor 	@b[5], @b[5], @b[0]
+	veor	@b[3], @b[3], @b[7]
+	 veor	@b[6], @b[6], @b[2]
+	 veor	@b[2], @b[2], @b[1]
+	veor	@b[6], @b[6], @b[3]
+
+	veor	@b[3], @b[3], @b[0]
+	veor	@b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+	veor 	$t0, $y0, $y1
+	vand	$t0, $t0, $x0
+	veor	$x0, $x0, $x1
+	vand	$t1, $x1, $y0
+	vand	$x0, $x0, $y1
+	veor	$x1, $t1, $t0
+	veor	$x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N {				# not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	veor	$t0, $y0, $y1
+	vand	$t0, $t0, $x0
+	veor	$x0, $x0, $x1
+	vand	$x1, $x1, $y0
+	vand	$x0, $x0, $y1
+	veor	$x1, $x1, $x0
+	veor	$x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+	veor	$t0, $y0, $y1
+	 veor 	$t1, $y2, $y3
+	vand	$t0, $t0, $x0
+	 vand	$t1, $t1, $x2
+	veor	$x0, $x0, $x1
+	 veor	$x2, $x2, $x3
+	vand	$x1, $x1, $y0
+	 vand	$x3, $x3, $y2
+	vand	$x0, $x0, $y1
+	 vand	$x2, $x2, $y3
+	veor	$x1, $x1, $x0
+	 veor	$x2, $x2, $x3
+	veor	$x0, $x0, $t0
+	 veor	$x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+	veor	@t[0], @x[0], @x[2]
+	veor	@t[1], @x[1], @x[3]
+___
+	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+	veor	@y[0], @y[0], @y[2]
+	veor	@y[1], @y[1], @y[3]
+___
+	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	veor	@x[0], @x[0], @t[0]
+	veor	@x[2], @x[2], @t[0]
+	veor	@x[1], @x[1], @t[1]
+	veor	@x[3], @x[3], @t[1]
+
+	veor	@t[0], @x[4], @x[6]
+	veor	@t[1], @x[5], @x[7]
+___
+	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	veor	@y[0], @y[0], @y[2]
+	veor	@y[1], @y[1], @y[3]
+___
+	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+	veor	@x[4], @x[4], @t[0]
+	veor	@x[6], @x[6], @t[0]
+	veor	@x[5], @x[5], @t[1]
+	veor	@x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+	veor	@t[3], @x[4], @x[6]
+	veor	@t[2], @x[5], @x[7]
+	veor	@t[1], @x[1], @x[3]
+	veor	@s[1], @x[7], @x[6]
+	 vmov	@t[0], @t[2]
+	veor	@s[0], @x[0], @x[2]
+
+	vorr	@t[2], @t[2], @t[1]
+	veor	@s[3], @t[3], @t[0]
+	vand	@s[2], @t[3], @s[0]
+	vorr	@t[3], @t[3], @s[0]
+	veor	@s[0], @s[0], @t[1]
+	vand	@t[0], @t[0], @t[1]
+	veor	@t[1], @x[3], @x[2]
+	vand	@s[3], @s[3], @s[0]
+	vand	@s[1], @s[1], @t[1]
+	veor	@t[1], @x[4], @x[5]
+	veor	@s[0], @x[1], @x[0]
+	veor	@t[3], @t[3], @s[1]
+	veor	@t[2], @t[2], @s[1]
+	vand	@s[1], @t[1], @s[0]
+	vorr	@t[1], @t[1], @s[0]
+	veor	@t[3], @t[3], @s[3]
+	veor	@t[0], @t[0], @s[1]
+	veor	@t[2], @t[2], @s[2]
+	veor	@t[1], @t[1], @s[3]
+	veor	@t[0], @t[0], @s[2]
+	vand	@s[0], @x[7], @x[3]
+	veor	@t[1], @t[1], @s[2]
+	vand	@s[1], @x[6], @x[2]
+	vand	@s[2], @x[5], @x[1]
+	vorr	@s[3], @x[4], @x[0]
+	veor	@t[3], @t[3], @s[0]
+	veor	@t[1], @t[1], @s[2]
+	veor	@t[0], @t[0], @s[3]
+	veor	@t[2], @t[2], @s[1]
+
+	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+	@ new smaller inversion
+
+	vand	@s[2], @t[3], @t[1]
+	vmov	@s[0], @t[0]
+
+	veor	@s[1], @t[2], @s[2]
+	veor	@s[3], @t[0], @s[2]
+	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]
+
+	vbsl	@s[1], @t[1], @t[0]
+	vbsl	@s[3], @t[3], @t[2]
+	veor	@t[3], @t[3], @t[2]
+
+	vbsl	@s[0], @s[1], @s[2]
+	vbsl	@t[0], @s[2], @s[1]
+
+	vand	@s[2], @s[0], @s[3]
+	veor	@t[1], @t[1], @t[0]
+
+	veor	@s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+	vldmia	$key!, {@t[0]-@t[3]}
+	veor	@t[0], @t[0], @x[0]
+	veor	@t[1], @t[1], @x[1]
+	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[0]}
+	veor	@t[2], @t[2], @x[2]
+	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[1]}
+	veor	@t[3], @t[3], @x[3]
+	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[2]}
+	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[3]}
+	veor	@t[0], @t[0], @x[4]
+	veor	@t[1], @t[1], @x[5]
+	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+	veor	@t[2], @t[2], @x[6]
+	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+	veor	@t[3], @t[3], @x[7]
+	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16];	# optional
+$code.=<<___;
+	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
+	vext.8	@t[1], @x[1], @x[1], #12
+	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
+	vext.8	@t[2], @x[2], @x[2], #12
+	 veor	@x[1], @x[1], @t[1]
+	vext.8	@t[3], @x[3], @x[3], #12
+	 veor	@x[2], @x[2], @t[2]
+	vext.8	@t[4], @x[4], @x[4], #12
+	 veor	@x[3], @x[3], @t[3]
+	vext.8	@t[5], @x[5], @x[5], #12
+	 veor	@x[4], @x[4], @t[4]
+	vext.8	@t[6], @x[6], @x[6], #12
+	 veor	@x[5], @x[5], @t[5]
+	vext.8	@t[7], @x[7], @x[7], #12
+	 veor	@x[6], @x[6], @t[6]
+
+	veor	@t[1], @t[1], @x[0]
+	 veor	@x[7], @x[7], @t[7]
+	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	@t[2], @t[2], @x[1]
+	veor	@t[0], @t[0], @x[7]
+	veor	@t[1], @t[1], @x[7]
+	 vext.8	@x[1], @x[1], @x[1], #8
+	veor	@t[5], @t[5], @x[4]
+	 veor	@x[0], @x[0], @t[0]
+	veor	@t[6], @t[6], @x[5]
+	 veor	@x[1], @x[1], @t[1]
+	 vext.8	@t[0], @x[4], @x[4], #8
+	veor	@t[4], @t[4], @x[3]
+	 vext.8	@t[1], @x[5], @x[5], #8
+	veor	@t[7], @t[7], @x[6]
+	 vext.8	@x[4], @x[3], @x[3], #8
+	veor	@t[3], @t[3], @x[2]
+	 vext.8	@x[5], @x[7], @x[7], #8
+	veor	@t[4], @t[4], @x[7]
+	 vext.8	@x[3], @x[6], @x[6], #8
+	veor	@t[3], @t[3], @x[7]
+	 vext.8	@x[6], @x[2], @x[2], #8
+	veor	@x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+	veor	@x[2], @t[0], @t[4]
+	veor	@x[4], @x[4], @t[3]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[3], @x[3], @t[6]
+	 @ vmov	@x[2], @t[0]
+	veor	@x[6], @x[6], @t[2]
+	 @ vmov	@x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+	veor	@t[3], @t[3], @x[4]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[2], @x[3], @t[6]
+	veor	@x[3], @t[0], @t[4]
+	veor	@x[4], @x[6], @t[2]
+	vmov	@x[6], @t[3]
+	 @ vmov	@x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+	@ multiplication by 0x0e
+	vext.8	@t[7], @x[7], @x[7], #12
+	vmov	@t[2], @x[2]
+	veor	@x[2], @x[2], @x[5]		@ 2 5
+	veor	@x[7], @x[7], @x[5]		@ 7 5
+	vext.8	@t[0], @x[0], @x[0], #12
+	vmov	@t[5], @x[5]
+	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
+	veor	@x[0], @x[0], @x[1]		@ 0 1
+	vext.8	@t[1], @x[1], @x[1], #12
+	veor	@x[1], @x[1], @x[2]		@ 1 25
+	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
+	vext.8	@t[3], @x[3], @x[3], #12
+	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
+	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
+	veor	@x[3], @x[3], @x[7]		@ 3 75
+	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
+	vext.8	@t[6], @x[6], @x[6], #12
+	vmov	@t[4], @x[4]
+	veor	@x[6], @x[6], @x[4]		@ 6 4
+	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
+	veor	@x[3], @x[3], @x[7]		@ 375 756=36
+	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
+	veor	@x[3], @x[3], @t[2]		@ 36 2
+	vext.8	@t[5], @t[5], @t[5], #12
+	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
+___
+					my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+	@ multiplication by 0x0b
+	veor	@y[1], @y[1], @y[0]
+	veor	@y[0], @y[0], @t[0]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[1], @y[1], @t[1]
+	veor	@y[0], @y[0], @t[5]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[1], @y[1], @t[6]
+	veor	@y[0], @y[0], @t[7]
+	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
+
+	veor	@y[3], @y[3], @t[0]
+	 veor	@y[1], @y[1], @y[0]
+	vext.8	@t[0], @t[0], @t[0], #12
+	veor	@y[2], @y[2], @t[1]
+	veor	@y[4], @y[4], @t[1]
+	vext.8	@t[1], @t[1], @t[1], #12
+	veor	@y[2], @y[2], @t[2]
+	veor	@y[3], @y[3], @t[2]
+	veor	@y[5], @y[5], @t[2]
+	veor	@y[2], @y[2], @t[7]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[3], @y[3], @t[3]
+	veor	@y[6], @y[6], @t[3]
+	veor	@y[4], @y[4], @t[3]
+	veor	@y[7], @y[7], @t[4]
+	vext.8	@t[3], @t[3], @t[3], #12
+	veor	@y[5], @y[5], @t[4]
+	veor	@y[7], @y[7], @t[7]
+	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
+	veor	@y[3], @y[3], @t[5]
+	veor	@y[4], @y[4], @t[4]
+
+	veor	@y[5], @y[5], @t[7]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[6], @y[6], @t[7]
+	veor	@y[4], @y[4], @t[7]
+
+	veor	@t[7], @t[7], @t[5]
+	vext.8	@t[5], @t[5], @t[5], #12
+
+	@ multiplication by 0x0d
+	veor	@y[4], @y[4], @y[7]
+	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
+	veor	@y[7], @y[7], @t[4]
+	vext.8	@t[6], @t[6], @t[6], #12
+	veor	@y[2], @y[2], @t[0]
+	veor	@y[7], @y[7], @t[5]
+	vext.8	@t[7], @t[7], @t[7], #12
+	veor	@y[2], @y[2], @t[2]
+
+	veor	@y[3], @y[3], @y[1]
+	veor	@y[1], @y[1], @t[1]
+	veor	@y[0], @y[0], @t[0]
+	veor	@y[3], @y[3], @t[0]
+	veor	@y[1], @y[1], @t[5]
+	veor	@y[0], @y[0], @t[5]
+	vext.8	@t[0], @t[0], @t[0], #12
+	veor	@y[1], @y[1], @t[7]
+	veor	@y[0], @y[0], @t[6]
+	veor	@y[3], @y[3], @y[1]
+	veor	@y[4], @y[4], @t[1]
+	vext.8	@t[1], @t[1], @t[1], #12
+
+	veor	@y[7], @y[7], @t[7]
+	veor	@y[4], @y[4], @t[2]
+	veor	@y[5], @y[5], @t[2]
+	veor	@y[2], @y[2], @t[6]
+	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[4], @y[4], @y[7]
+	veor	@y[3], @y[3], @t[6]
+
+	veor	@y[6], @y[6], @t[6]
+	veor	@y[5], @y[5], @t[5]
+	vext.8	@t[5], @t[5], @t[5], #12
+	veor	@y[6], @y[6], @t[4]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[5], @y[5], @t[6]
+	veor	@y[6], @y[6], @t[7]
+	vext.8	@t[7], @t[7], @t[7], #12
+	veor	@t[6], @t[6], @t[3]		@ restore t[6]
+	vext.8	@t[3], @t[3], @t[3], #12
+
+	@ multiplication by 0x09
+	veor	@y[4], @y[4], @y[1]
+	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
+	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
+	vext.8	@t[6], @t[6], @t[6], #12
+	veor	@t[1], @t[1], @t[5]
+	veor	@y[3], @y[3], @t[0]
+	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
+	veor	@t[1], @t[1], @t[6]
+	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
+	veor	@y[4], @y[4], @t[1]
+	veor	@y[7], @y[7], @t[4]
+	veor	@y[6], @y[6], @t[3]
+	veor	@y[5], @y[5], @t[2]
+	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
+	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
+	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
+	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
+	veor	@t[3], @t[3], @t[7]
+	veor	@XMM[5], @t[5], @t[6]
+	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
+	veor	@XMM[2], @t[2], @t[6]
+	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
+
+	vmov	@XMM[0], @t[0]
+	vmov	@XMM[1], @t[1]
+	@ vmov	@XMM[2], @t[2]
+	vmov	@XMM[3], @t[3]
+	vmov	@XMM[4], @t[4]
+	@ vmov	@XMM[5], @t[5]
+	@ vmov	@XMM[6], @t[6]
+	@ vmov	@XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+
+$code.=<<___;
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	@t[0], @x[0], @x[0], #8
+	vext.8	@t[6], @x[6], @x[6], #8
+	vext.8	@t[7], @x[7], @x[7], #8
+	veor	@t[0], @t[0], @x[0]
+	vext.8	@t[1], @x[1], @x[1], #8
+	veor	@t[6], @t[6], @x[6]
+	vext.8	@t[2], @x[2], @x[2], #8
+	veor	@t[7], @t[7], @x[7]
+	vext.8	@t[3], @x[3], @x[3], #8
+	veor	@t[1], @t[1], @x[1]
+	vext.8	@t[4], @x[4], @x[4], #8
+	veor	@t[2], @t[2], @x[2]
+	vext.8	@t[5], @x[5], @x[5], #8
+	veor	@t[3], @t[3], @x[3]
+	veor	@t[4], @t[4], @x[4]
+	veor	@t[5], @t[5], @x[5]
+
+	 veor	@x[0], @x[0], @t[6]
+	 veor	@x[1], @x[1], @t[6]
+	 veor	@x[2], @x[2], @t[0]
+	 veor	@x[4], @x[4], @t[2]
+	 veor	@x[3], @x[3], @t[1]
+	 veor	@x[1], @x[1], @t[7]
+	 veor	@x[2], @x[2], @t[7]
+	 veor	@x[4], @x[4], @t[6]
+	 veor	@x[5], @x[5], @t[3]
+	 veor	@x[3], @x[3], @t[6]
+	 veor	@x[6], @x[6], @t[4]
+	 veor	@x[4], @x[4], @t[7]
+	 veor	@x[5], @x[5], @t[7]
+	 veor	@x[7], @x[7], @t[5]
+___
+	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+	vshr.u64	$t, $b, #$n
+	veor		$t, $t, $a
+	vand		$t, $t, $mask
+	veor		$a, $a, $t
+	vshl.u64	$t, $t, #$n
+	veor		$b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+	vshr.u64	$t0, $b0, #$n
+	 vshr.u64	$t1, $b1, #$n
+	veor		$t0, $t0, $a0
+	 veor		$t1, $t1, $a1
+	vand		$t0, $t0, $mask
+	 vand		$t1, $t1, $mask
+	veor		$a0, $a0, $t0
+	vshl.u64	$t0, $t0, #$n
+	 veor		$a1, $a1, $t1
+	 vshl.u64	$t1, $t1, #$n
+	veor		$b0, $b0, $t0
+	 veor		$b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+	vmov.i8	$t0,#0x55			@ compose .LBS0
+	vmov.i8	$t1,#0x33			@ compose .LBS1
+___
+	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+	vmov.i8	$t0,#0x0f			@ compose .LBS2
+___
+	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu	neon
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	$const,_bsaes_decrypt8
+	vldmia	$key!, {@XMM[9]}		@ round 0 key
+	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
+
+	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
+	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
+	veor	@XMM[11], @XMM[1], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	veor	@XMM[12], @XMM[2], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+	veor	@XMM[13], @XMM[3], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+	veor	@XMM[14], @XMM[4], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+	veor	@XMM[15], @XMM[5], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+	veor	@XMM[10], @XMM[6], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+	veor	@XMM[11], @XMM[7], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	sub	$rounds,$rounds,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+	&InvSbox	(@XMM[0..7, 8..15]);
+$code.=<<___;
+	subs	$rounds,$rounds,#1
+	bcc	.Ldec_done
+___
+	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+	vldmia	$const, {@XMM[12]}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	$const,$const,#0x10
+	bne	.Ldec_loop
+	vldmia	$const, {@XMM[12]}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+___
+	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+	vldmia	$key, {@XMM[8]}			@ last round key
+	veor	@XMM[6], @XMM[6], @XMM[8]
+	veor	@XMM[4], @XMM[4], @XMM[8]
+	veor	@XMM[2], @XMM[2], @XMM[8]
+	veor	@XMM[7], @XMM[7], @XMM[8]
+	veor	@XMM[3], @XMM[3], @XMM[8]
+	veor	@XMM[5], @XMM[5], @XMM[8]
+	veor	@XMM[0], @XMM[0], @XMM[8]
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:	@ InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:		@ ShiftRows constants
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+	.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	$const,_bsaes_encrypt8
+	vldmia	$key!, {@XMM[9]}		@ round 0 key
+	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
+
+	vldmia	$const!, {@XMM[8]}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
+	veor	@XMM[11], @XMM[1], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	veor	@XMM[12], @XMM[2], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+	veor	@XMM[13], @XMM[3], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+	veor	@XMM[14], @XMM[4], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+	veor	@XMM[15], @XMM[5], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+	veor	@XMM[10], @XMM[6], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+	veor	@XMM[11], @XMM[7], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	sub	$rounds,$rounds,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+	&Sbox		(@XMM[0..7, 8..15]);
+$code.=<<___;
+	subs	$rounds,$rounds,#1
+	bcc	.Lenc_done
+___
+	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+	vldmia	$const, {@XMM[12]}		@ .LSR
+	ite	eq				@ Thumb2 thing, samity check in ARM
+	addeq	$const,$const,#0x10
+	bne	.Lenc_loop
+	vldmia	$const, {@XMM[12]}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+___
+	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+	vldmia	$key, {@XMM[8]}			@ last round key
+	veor	@XMM[4], @XMM[4], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[8]
+	veor	@XMM[3], @XMM[3], @XMM[8]
+	veor	@XMM[7], @XMM[7], @XMM[8]
+	veor	@XMM[2], @XMM[2], @XMM[8]
+	veor	@XMM[5], @XMM[5], @XMM[8]
+	veor	@XMM[0], @XMM[0], @XMM[8]
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+	vmov	@x[2], @x[0]
+	vmov	@x[3], @x[1]
+___
+	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+	vmov	@x[4], @x[0]
+	vmov	@x[6], @x[2]
+	vmov	@x[5], @x[1]
+	vmov	@x[7], @x[3]
+___
+	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
+	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	$const,_bsaes_key_convert
+	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
+	sub	$const,$const,#_bsaes_key_convert-.LM0
+	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
+
+	vmov.i8	@XMM[8],  #0x01			@ bit masks
+	vmov.i8	@XMM[9],  #0x02
+	vmov.i8	@XMM[10], #0x04
+	vmov.i8	@XMM[11], #0x08
+	vmov.i8	@XMM[12], #0x10
+	vmov.i8	@XMM[13], #0x20
+	vldmia	$const, {@XMM[14]}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	@XMM[7],  @XMM[7]
+	vrev32.8	@XMM[15], @XMM[15]
+#endif
+	sub	$rounds,$rounds,#1
+	vstmia	$out!, {@XMM[7]}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+	vmov.i8	@XMM[6],  #0x40
+	vmov.i8	@XMM[15], #0x80
+
+	vtst.8	@XMM[0], @XMM[7], @XMM[8]
+	vtst.8	@XMM[1], @XMM[7], @XMM[9]
+	vtst.8	@XMM[2], @XMM[7], @XMM[10]
+	vtst.8	@XMM[3], @XMM[7], @XMM[11]
+	vtst.8	@XMM[4], @XMM[7], @XMM[12]
+	vtst.8	@XMM[5], @XMM[7], @XMM[13]
+	vtst.8	@XMM[6], @XMM[7], @XMM[6]
+	vtst.8	@XMM[7], @XMM[7], @XMM[15]
+	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
+	vmvn	@XMM[0], @XMM[0]		@ "pnot"
+	vmvn	@XMM[1], @XMM[1]
+	vmvn	@XMM[5], @XMM[5]
+	vmvn	@XMM[6], @XMM[6]
+#ifdef __ARMEL__
+	vrev32.8	@XMM[15], @XMM[15]
+#endif
+	subs	$rounds,$rounds,#1
+	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	@XMM[7],#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) {		# following four functions are unsupported interface
+			# used for benchmarking...
+$code.=<<___;
+.globl	bsaes_enc_key_convert
+.type	bsaes_enc_key_convert,%function
+.align	4
+bsaes_enc_key_convert:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	ldr	r5,[$inp,#240]			@ pass rounds
+	mov	r4,$inp				@ pass key
+	mov	r12,$out			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl	bsaes_encrypt_128
+.type	bsaes_encrypt_128,%function
+.align	4
+bsaes_encrypt_128:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+.Lenc128_loop:
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+	mov	r4,$key				@ pass the key
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5,#10				@ pass rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
+
+	bl	_bsaes_encrypt8
+
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	subs	$len,$len,#0x80
+	vst1.8	{@XMM[5]}, [$out]!
+	bhi	.Lenc128_loop
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl	bsaes_dec_key_convert
+.type	bsaes_dec_key_convert,%function
+.align	4
+bsaes_dec_key_convert:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	ldr	r5,[$inp,#240]			@ pass rounds
+	mov	r4,$inp				@ pass key
+	mov	r12,$out			@ pass key schedule
+	bl	_bsaes_key_convert
+	vldmia	$out, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	$out, {@XMM[7]}
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl	bsaes_decrypt_128
+.type	bsaes_decrypt_128,%function
+.align	4
+bsaes_decrypt_128:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+.Ldec128_loop:
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+	mov	r4,$key				@ pass the key
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5,#10				@ pass rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
+
+	bl	_bsaes_decrypt8
+
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	subs	$len,$len,#0x80
+	vst1.8	{@XMM[5]}, [$out]!
+	bhi	.Ldec128_loop
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+#ifndef	__KERNEL__
+	cmp	$len, #128
+#ifndef	__thumb__
+	blo	AES_cbc_encrypt
+#else
+	bhs	1f
+	b	AES_cbc_encrypt
+1:
+#endif
+#endif
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
+	mov	$len, $len, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	$fp, sp				@ save sp
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	add	r12, #`128-32`			@ sifze of bit-slices key schedule
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12				@ sp is $keysched
+	bl	_bsaes_key_convert
+	vldmia	$keysched, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	$keysched, {@XMM[7]}
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, $key, #248
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12, {@XMM[15]}			@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+
+.align	2
+0:
+#endif
+
+	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	$len, $len, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, $keysched			@ pass the key
+#else
+	add	r4, $key, #248
+#endif
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5, $rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]
+	sub	$inp, $inp, #0x60
+	vstmia	$fp, {@XMM[15]}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[3], @XMM[3], @XMM[13]
+	vst1.8	{@XMM[6]}, [$out]!
+	veor	@XMM[5], @XMM[5], @XMM[14]
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	vst1.8	{@XMM[5]}, [$out]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	$len, $len, #8
+	beq	.Lcbc_dec_done
+
+	vld1.8	{@XMM[0]}, [$inp]!		@ load input
+	cmp	$len, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{@XMM[1]}, [$inp]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, $keysched			@ pass the key
+#else
+	add	r4, $key, #248
+#endif
+	mov	r5, $rounds
+	vstmia	$fp, {@XMM[15]}			@ put aside IV
+	beq	.Lcbc_dec_two
+	vld1.8	{@XMM[2]}, [$inp]!
+	cmp	$len, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{@XMM[3]}, [$inp]!
+	beq	.Lcbc_dec_four
+	vld1.8	{@XMM[4]}, [$inp]!
+	cmp	$len, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{@XMM[5]}, [$inp]!
+	beq	.Lcbc_dec_six
+	vld1.8	{@XMM[6]}, [$inp]!
+	sub	$inp, $inp, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[3], @XMM[3], @XMM[13]
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	$inp, $inp, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	$fp,{@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	$inp, $inp, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	$inp, $inp, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	$inp, $inp, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	$inp, $inp, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	$inp, $inp, #0x10
+	mov	$rounds, $out			@ save original out pointer
+	mov	$out, $fp			@ use the iv scratch space as out buffer
+	mov	r2, $key
+	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
+	vmov	@XMM[5],@XMM[0]			@ and input are preserved
+	bl	AES_decrypt
+	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
+	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
+	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
+	vst1.8	{@XMM[0]}, [$rounds]		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		$keysched!, {q0-q1}
+	cmp		$keysched, $fp
+	bne		.Lcbc_dec_bzero
+#endif
+
+	mov	sp, $fp
+	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
+	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6";	# shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern	AES_encrypt
+.global	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	cmp	$len, #8			@ use plain AES for
+	blo	.Lctr_enc_short			@ small sizes
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	$fp, sp				@ save sp
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	add	r12, #`128-32`			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12				@ sp is $keysched
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
+	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
+	vldmia	$keysched, {@XMM[4]}		@ load round0 key
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+.align	2
+0:	add	r12, $key, #248
+	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
+	adrl	$ctr, .LREVM0SR			@ borrow $ctr
+	vldmia	r12, {@XMM[4]}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	@XMM[8],#1		@ compose 1<<96
+	veor		@XMM[9],@XMM[9],@XMM[9]
+	vrev32.8	@XMM[0],@XMM[0]
+	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
+	vrev32.8	@XMM[4],@XMM[4]
+	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
+	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
+	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
+	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
+	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
+	vadd.u32	@XMM[4], @XMM[1], @XMM[10]
+	vadd.u32	@XMM[5], @XMM[2], @XMM[10]
+	vadd.u32	@XMM[6], @XMM[3], @XMM[10]
+	vadd.u32	@XMM[7], @XMM[4], @XMM[10]
+	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia		$keysched, {@XMM[9]}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, $keysched, #0x10		@ pass next round key
+#else
+	add		r4, $key, #`248+16`
+#endif
+	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
+	mov		r5, $rounds			@ pass rounds
+	vstmia		$fp, {@XMM[10]}			@ save next counter
+	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
+
+	bl		_bsaes_encrypt8_alt
+
+	subs		$len, $len, #8
+	blo		.Lctr_enc_loop_done
+
+	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
+	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
+	veor		@XMM[0], @XMM[8]
+	veor		@XMM[1], @XMM[9]
+	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
+	veor		@XMM[4], @XMM[10]
+	veor		@XMM[6], @XMM[11]
+	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
+	veor		@XMM[3], @XMM[12]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor		@XMM[7], @XMM[13]
+	veor		@XMM[2], @XMM[14]
+	vst1.8		{@XMM[4]}, [$out]!
+	veor		@XMM[5], @XMM[15]
+	vst1.8		{@XMM[6]}, [$out]!
+	vmov.i32	@XMM[8], #1			@ compose 1<<96
+	vst1.8		{@XMM[3]}, [$out]!
+	veor		@XMM[9], @XMM[9], @XMM[9]
+	vst1.8		{@XMM[7]}, [$out]!
+	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
+	vst1.8		{@XMM[2]}, [$out]!
+	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
+	vst1.8		{@XMM[5]}, [$out]!
+	vldmia		$fp, {@XMM[0]}			@ load counter
+
+	bne		.Lctr_enc_loop
+	b		.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add		$len, $len, #8
+	vld1.8		{@XMM[8]}, [$inp]!	@ load input
+	veor		@XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!	@ write output
+	cmp		$len, #2
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[9]}, [$inp]!
+	veor		@XMM[1], @XMM[9]
+	vst1.8		{@XMM[1]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[10]}, [$inp]!
+	veor		@XMM[4], @XMM[10]
+	vst1.8		{@XMM[4]}, [$out]!
+	cmp		$len, #4
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[11]}, [$inp]!
+	veor		@XMM[6], @XMM[11]
+	vst1.8		{@XMM[6]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[12]}, [$inp]!
+	veor		@XMM[3], @XMM[12]
+	vst1.8		{@XMM[3]}, [$out]!
+	cmp		$len, #6
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[13]}, [$inp]!
+	veor		@XMM[7], @XMM[13]
+	vst1.8		{@XMM[7]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[14]}, [$inp]
+	veor		@XMM[2], @XMM[14]
+	vst1.8		{@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:			@ wipe key schedule [if any]
+	vstmia		$keysched!, {q0-q1}
+	cmp		$keysched, $fp
+	bne		.Lctr_enc_bzero
+#else
+	vstmia		$keysched, {q0-q1}
+#endif
+
+	mov	sp, $fp
+	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}	@ return
+
+.align	4
+.Lctr_enc_short:
+	ldr	ip, [sp]		@ ctr pointer is passed on stack
+	stmdb	sp!, {r4-r8, lr}
+
+	mov	r4, $inp		@ copy arguments
+	mov	r5, $out
+	mov	r6, $len
+	mov	r7, $key
+	ldr	r8, [ip, #12]		@ load counter LSW
+	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
+#ifdef __ARMEL__
+	rev	r8, r8
+#endif
+	sub	sp, sp, #0x10
+	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
+	sub	sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+	add	r0, sp, #0x10		@ input counter value
+	mov	r1, sp			@ output on the stack
+	mov	r2, r7			@ key
+
+	bl	AES_encrypt
+
+	vld1.8	{@XMM[0]}, [r4]!	@ load input
+	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
+	add	r8, r8, #1
+#ifdef __ARMEL__
+	rev	r0, r8
+	str	r0, [sp, #0x1c]		@ next counter value
+#else
+	str	r8, [sp, #0x1c]		@ next counter value
+#endif
+	veor	@XMM[0],@XMM[0],@XMM[1]
+	vst1.8	{@XMM[0]}, [r5]!	@ store output
+	subs	r6, r6, #1
+	bne	.Lctr_enc_short_loop
+
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+	vstmia		sp!, {q0-q1}
+
+	ldmia	sp!, {r4-r8, pc}
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6";		# returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,%function
+.align	4
+bsaes_xts_encrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future $fp
+
+	mov	$inp, r0
+	mov	$out, r1
+	mov	$len, r2
+	mov	$key, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0,sp				@ pointer to initial tweak
+#endif
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+	mov	$fp, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
+	sub	r12, #`32+16`			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+
+	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
+	adr	$magic, .Lxts_magic
+
+	subs	$len, #0x80
+	blo	.Lxts_enc_short
+	b	.Lxts_enc_loop
+
+.align	4
+.Lxts_enc_loop:
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
+	vst1.64		{@XMM[15]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	veor		@XMM[8], @XMM[8], @T[0]
+	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	veor		@XMM[7], @XMM[7], @XMM[15]
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[2], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	veor		@XMM[13], @XMM[5], @XMM[15]
+	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	subs		$len, #0x80
+	bpl		.Lxts_enc_loop
+
+.Lxts_enc_short:
+	adds		$len, #0x70
+	bmi		.Lxts_enc_done
+
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+	subs		$len, #0x10
+	bmi		.Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	sub		$len, #0x10
+	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vld1.64		{@XMM[14]}, [r0,:128]!
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[2], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	vst1.8		{@XMM[12]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_6:
+	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[4], @XMM[4], @XMM[12]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[5], @XMM[5], @XMM[13]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align	5
+.Lxts_magic:
+	.quad	1, 0x87
+
+.align	5
+.Lxts_enc_5:
+	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[3], @XMM[3], @XMM[11]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[4], @XMM[4], @XMM[12]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	vst1.8		{@XMM[10]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_4:
+	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[2], @XMM[2], @XMM[10]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[3], @XMM[3], @XMM[11]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_3:
+	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[1], @XMM[1], @XMM[9]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[2], @XMM[2], @XMM[10]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	vld1.64		{@XMM[10]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	vst1.8		{@XMM[8]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_2:
+	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[0], @XMM[0], @XMM[8]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[1], @XMM[1], @XMM[9]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_1:
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp				@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!
+	mov		$fp, r4
+
+	vmov		@XMM[8], @XMM[9]		@ next round tweak
+
+.Lxts_enc_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		$len, #0x10
+	beq		.Lxts_enc_ret
+	sub		r6, $out, #0x10
+
+.Lxts_enc_steal:
+	ldrb		r0, [$inp], #1
+	ldrb		r1, [$out, #-0x10]
+	strb		r0, [$out, #-0x10]
+	strb		r1, [$out], #1
+
+	subs		$len, #1
+	bhi		.Lxts_enc_steal
+
+	vld1.8		{@XMM[0]}, [r6]
+	mov		r0, sp
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp			@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [r6]
+	mov		$fp, r4
+#endif
+
+.Lxts_enc_ret:
+	bic		r0, $fp, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_enc_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_enc_bzero
+
+	mov		sp, $fp
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{@XMM[8]}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,%function
+.align	4
+bsaes_xts_decrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future $fp
+
+	mov	$inp, r0
+	mov	$out, r1
+	mov	$len, r2
+	mov	$key, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0, sp				@ pointer to initial tweak
+#endif
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+	mov	$fp, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
+	sub	r12, #`32+16`			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, sp, #0x90
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, $key, #248
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
+	adr	$magic, .Lxts_magic
+
+	tst	$len, #0xf			@ if not multiple of 16
+	it	ne				@ Thumb2 thing, sanity check in ARM
+	subne	$len, #0x10			@ subtract another 16 bytes
+	subs	$len, #0x80
+
+	blo	.Lxts_dec_short
+	b	.Lxts_dec_loop
+
+.align	4
+.Lxts_dec_loop:
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
+	vst1.64		{@XMM[15]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	veor		@XMM[8], @XMM[8], @T[0]
+	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	veor		@XMM[7], @XMM[7], @XMM[15]
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[3], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	veor		@XMM[13], @XMM[5], @XMM[15]
+	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	subs		$len, #0x80
+	bpl		.Lxts_dec_loop
+
+.Lxts_dec_short:
+	adds		$len, #0x70
+	bmi		.Lxts_dec_done
+
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+	subs		$len, #0x10
+	bmi		.Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	sub		$len, #0x10
+	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vld1.64		{@XMM[14]}, [r0,:128]!
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[3], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	vst1.8		{@XMM[12]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_6:
+	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[4], @XMM[4], @XMM[12]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[5], @XMM[5], @XMM[13]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_5:
+	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[3], @XMM[3], @XMM[11]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[4], @XMM[4], @XMM[12]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	vst1.8		{@XMM[10]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_4:
+	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[2], @XMM[2], @XMM[10]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[3], @XMM[3], @XMM[11]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_3:
+	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[1], @XMM[1], @XMM[9]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[2], @XMM[2], @XMM[10]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	vld1.64		{@XMM[10]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	vst1.8		{@XMM[8]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_2:
+	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[0], @XMM[0], @XMM[8]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[1], @XMM[1], @XMM[9]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_1:
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp				@ preserve fp
+	mov		r5, $magic			@ preserve magic
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!
+	mov		$fp, r4
+	mov		$magic, r5
+
+	vmov		@XMM[8], @XMM[9]		@ next round tweak
+
+.Lxts_dec_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		$len, #0x10
+	beq		.Lxts_dec_ret
+
+	@ calculate one round of extra tweak for the stolen ciphertext
+	vldmia		$magic, {$twmask}
+	vshr.s64	@XMM[6], @XMM[8], #63
+	vand		@XMM[6], @XMM[6], $twmask
+	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
+	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+	veor		@XMM[9], @XMM[9], @XMM[6]
+
+	@ perform the final decryption with the last tweak value
+	vld1.8		{@XMM[0]}, [$inp]!
+	mov		r0, sp
+	veor		@XMM[0], @XMM[0], @XMM[9]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp			@ preserve fp
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[9]
+	vst1.8		{@XMM[0]}, [$out]
+
+	mov		r6, $out
+.Lxts_dec_steal:
+	ldrb		r1, [$out]
+	ldrb		r0, [$inp], #1
+	strb		r1, [$out, #0x10]
+	strb		r0, [$out], #1
+
+	subs		$len, #1
+	bhi		.Lxts_dec_steal
+
+	vld1.8		{@XMM[0]}, [r6]
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [r6]
+	mov		$fp, r4
+#endif
+
+.Lxts_dec_ret:
+	bic		r0, $fp, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_dec_bzero
+
+	mov		sp, $fp
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{@XMM[8]}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+        last if (!s/^#/@/ and !/^$/);
+        print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/armcap.c b/main/openssl/crypto/armcap.c
new file mode 100644
index 00000000..9abaf396
--- /dev/null
+++ b/main/openssl/crypto/armcap.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <crypto.h>
+
+#include "arm_arch.h"
+
+unsigned int OPENSSL_armcap_P;
+
+static sigset_t all_masked;
+
+static sigjmp_buf ill_jmp;
+static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
+
+/*
+ * Following subroutines could have been inlined, but it's not all
+ * ARM compilers support inline assembler...
+ */
+void _armv7_neon_probe(void);
+unsigned int _armv7_tick(void);
+
+unsigned int OPENSSL_rdtsc(void)
+	{
+	if (OPENSSL_armcap_P & ARMV7_TICK)
+		return _armv7_tick();
+	else
+		return 0;
+	}
+
+#if defined(__GNUC__) && __GNUC__>=2
+void OPENSSL_cpuid_setup(void) __attribute__((constructor));
+#endif
+void OPENSSL_cpuid_setup(void)
+	{
+	char *e;
+	struct sigaction	ill_oact,ill_act;
+	sigset_t		oset;
+	static int trigger=0;
+
+	if (trigger) return;
+	trigger=1;
+ 
+	if ((e=getenv("OPENSSL_armcap")))
+		{
+		OPENSSL_armcap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	sigfillset(&all_masked);
+	sigdelset(&all_masked,SIGILL);
+	sigdelset(&all_masked,SIGTRAP);
+	sigdelset(&all_masked,SIGFPE);
+	sigdelset(&all_masked,SIGBUS);
+	sigdelset(&all_masked,SIGSEGV);
+
+	OPENSSL_armcap_P = 0;
+
+	memset(&ill_act,0,sizeof(ill_act));
+	ill_act.sa_handler = ill_handler;
+	ill_act.sa_mask    = all_masked;
+
+	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
+	sigaction(SIGILL,&ill_act,&ill_oact);
+
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_neon_probe();
+		OPENSSL_armcap_P |= ARMV7_NEON;
+		}
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_tick();
+		OPENSSL_armcap_P |= ARMV7_TICK;
+		}
+
+	sigaction (SIGILL,&ill_oact,NULL);
+	sigprocmask(SIG_SETMASK,&oset,NULL);
+	}
diff --git a/main/openssl/crypto/bn/asm/bn-586.S b/main/openssl/crypto/bn/asm/bn-586.S
index fe873ce9..66695e26 100644
--- a/main/openssl/crypto/bn/asm/bn-586.S
+++ b/main/openssl/crypto/bn/asm/bn-586.S
@@ -5,6 +5,103 @@
 .align	16
 bn_mul_add_words:
 .L_bn_mul_add_words_begin:
+	call	.L000PIC_me_up
+.L000PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L001maw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+	jmp	.L002maw_sse2_entry
+.align	16
+.L003maw_sse2_unrolled:
+	movd	(%eax),%mm3
+	paddq	%mm3,%mm1
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	movd	4(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	movd	8(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	12(%edx),%mm7
+	pmuludq	%mm0,%mm7
+	paddq	%mm2,%mm1
+	movd	4(%eax),%mm3
+	paddq	%mm4,%mm3
+	movd	8(%eax),%mm5
+	paddq	%mm6,%mm5
+	movd	12(%eax),%mm4
+	paddq	%mm4,%mm7
+	movd	%mm1,(%eax)
+	movd	16(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	psrlq	$32,%mm1
+	movd	20(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	paddq	%mm3,%mm1
+	movd	24(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	%mm1,4(%eax)
+	psrlq	$32,%mm1
+	movd	28(%edx),%mm3
+	addl	$32,%edx
+	pmuludq	%mm0,%mm3
+	paddq	%mm5,%mm1
+	movd	16(%eax),%mm5
+	paddq	%mm5,%mm2
+	movd	%mm1,8(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm7,%mm1
+	movd	20(%eax),%mm5
+	paddq	%mm5,%mm4
+	movd	%mm1,12(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm2,%mm1
+	movd	24(%eax),%mm5
+	paddq	%mm5,%mm6
+	movd	%mm1,16(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm4,%mm1
+	movd	28(%eax),%mm5
+	paddq	%mm5,%mm3
+	movd	%mm1,20(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm6,%mm1
+	movd	%mm1,24(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm3,%mm1
+	movd	%mm1,28(%eax)
+	leal	32(%eax),%eax
+	psrlq	$32,%mm1
+	subl	$8,%ecx
+	jz	.L004maw_sse2_exit
+.L002maw_sse2_entry:
+	testl	$4294967288,%ecx
+	jnz	.L003maw_sse2_unrolled
+.align	4
+.L005maw_sse2_loop:
+	movd	(%edx),%mm2
+	movd	(%eax),%mm3
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm3,%mm1
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L005maw_sse2_loop
+.L004maw_sse2_exit:
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L001maw_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -17,9 +114,9 @@ bn_mul_add_words:
 	andl	$4294967288,%ecx
 	movl	32(%esp),%ebp
 	pushl	%ecx
-	jz	.L000maw_finish
+	jz	.L006maw_finish
 .align	16
-.L001maw_loop:
+.L007maw_loop:
 
 	movl	(%ebx),%eax
 	mull	%ebp
@@ -96,13 +193,13 @@ bn_mul_add_words:
 	subl	$8,%ecx
 	leal	32(%ebx),%ebx
 	leal	32(%edi),%edi
-	jnz	.L001maw_loop
-.L000maw_finish:
+	jnz	.L007maw_loop
+.L006maw_finish:
 	movl	32(%esp),%ecx
 	andl	$7,%ecx
-	jnz	.L002maw_finish2
-	jmp	.L003maw_end
-.L002maw_finish2:
+	jnz	.L008maw_finish2
+	jmp	.L009maw_end
+.L008maw_finish2:
 
 	movl	(%ebx),%eax
 	mull	%ebp
@@ -113,7 +210,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	4(%ebx),%eax
 	mull	%ebp
@@ -124,7 +221,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,4(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	8(%ebx),%eax
 	mull	%ebp
@@ -135,7 +232,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,8(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	12(%ebx),%eax
 	mull	%ebp
@@ -146,7 +243,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,12(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	16(%ebx),%eax
 	mull	%ebp
@@ -157,7 +254,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,16(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	20(%ebx),%eax
 	mull	%ebp
@@ -168,7 +265,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,20(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	24(%ebx),%eax
 	mull	%ebp
@@ -178,7 +275,7 @@ bn_mul_add_words:
 	adcl	$0,%edx
 	movl	%eax,24(%edi)
 	movl	%edx,%esi
-.L003maw_end:
+.L009maw_end:
 	movl	%esi,%eax
 	popl	%ecx
 	popl	%edi
@@ -192,6 +289,34 @@ bn_mul_add_words:
 .align	16
 bn_mul_words:
 .L_bn_mul_words_begin:
+	call	.L010PIC_me_up
+.L010PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L011mw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+.align	16
+.L012mw_sse2_loop:
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L012mw_sse2_loop
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L011mw_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -203,8 +328,8 @@ bn_mul_words:
 	movl	28(%esp),%ebp
 	movl	32(%esp),%ecx
 	andl	$4294967288,%ebp
-	jz	.L004mw_finish
-.L005mw_loop:
+	jz	.L013mw_finish
+.L014mw_loop:
 
 	movl	(%ebx),%eax
 	mull	%ecx
@@ -265,14 +390,14 @@ bn_mul_words:
 	addl	$32,%ebx
 	addl	$32,%edi
 	subl	$8,%ebp
-	jz	.L004mw_finish
-	jmp	.L005mw_loop
-.L004mw_finish:
+	jz	.L013mw_finish
+	jmp	.L014mw_loop
+.L013mw_finish:
 	movl	28(%esp),%ebp
 	andl	$7,%ebp
-	jnz	.L006mw_finish2
-	jmp	.L007mw_end
-.L006mw_finish2:
+	jnz	.L015mw_finish2
+	jmp	.L016mw_end
+.L015mw_finish2:
 
 	movl	(%ebx),%eax
 	mull	%ecx
@@ -281,7 +406,7 @@ bn_mul_words:
 	movl	%eax,(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	4(%ebx),%eax
 	mull	%ecx
@@ -290,7 +415,7 @@ bn_mul_words:
 	movl	%eax,4(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	8(%ebx),%eax
 	mull	%ecx
@@ -299,7 +424,7 @@ bn_mul_words:
 	movl	%eax,8(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	12(%ebx),%eax
 	mull	%ecx
@@ -308,7 +433,7 @@ bn_mul_words:
 	movl	%eax,12(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	16(%ebx),%eax
 	mull	%ecx
@@ -317,7 +442,7 @@ bn_mul_words:
 	movl	%eax,16(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	20(%ebx),%eax
 	mull	%ecx
@@ -326,7 +451,7 @@ bn_mul_words:
 	movl	%eax,20(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	24(%ebx),%eax
 	mull	%ecx
@@ -334,7 +459,7 @@ bn_mul_words:
 	adcl	$0,%edx
 	movl	%eax,24(%edi)
 	movl	%edx,%esi
-.L007mw_end:
+.L016mw_end:
 	movl	%esi,%eax
 	popl	%edi
 	popl	%esi
@@ -347,6 +472,29 @@ bn_mul_words:
 .align	16
 bn_sqr_words:
 .L_bn_sqr_words_begin:
+	call	.L017PIC_me_up
+.L017PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L017PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L018sqr_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+.align	16
+.L019sqr_sse2_loop:
+	movd	(%edx),%mm0
+	pmuludq	%mm0,%mm0
+	leal	4(%edx),%edx
+	movq	%mm0,(%eax)
+	subl	$1,%ecx
+	leal	8(%eax),%eax
+	jnz	.L019sqr_sse2_loop
+	emms
+	ret
+.align	16
+.L018sqr_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -356,8 +504,8 @@ bn_sqr_words:
 	movl	24(%esp),%edi
 	movl	28(%esp),%ebx
 	andl	$4294967288,%ebx
-	jz	.L008sw_finish
-.L009sw_loop:
+	jz	.L020sw_finish
+.L021sw_loop:
 
 	movl	(%edi),%eax
 	mull	%eax
@@ -402,59 +550,59 @@ bn_sqr_words:
 	addl	$32,%edi
 	addl	$64,%esi
 	subl	$8,%ebx
-	jnz	.L009sw_loop
-.L008sw_finish:
+	jnz	.L021sw_loop
+.L020sw_finish:
 	movl	28(%esp),%ebx
 	andl	$7,%ebx
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	(%edi),%eax
 	mull	%eax
 	movl	%eax,(%esi)
 	decl	%ebx
 	movl	%edx,4(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	4(%edi),%eax
 	mull	%eax
 	movl	%eax,8(%esi)
 	decl	%ebx
 	movl	%edx,12(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	8(%edi),%eax
 	mull	%eax
 	movl	%eax,16(%esi)
 	decl	%ebx
 	movl	%edx,20(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	12(%edi),%eax
 	mull	%eax
 	movl	%eax,24(%esi)
 	decl	%ebx
 	movl	%edx,28(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	16(%edi),%eax
 	mull	%eax
 	movl	%eax,32(%esi)
 	decl	%ebx
 	movl	%edx,36(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	20(%edi),%eax
 	mull	%eax
 	movl	%eax,40(%esi)
 	decl	%ebx
 	movl	%edx,44(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	24(%edi),%eax
 	mull	%eax
 	movl	%eax,48(%esi)
 	movl	%edx,52(%esi)
-.L010sw_end:
+.L022sw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -488,8 +636,8 @@ bn_add_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L011aw_finish
-.L012aw_loop:
+	jz	.L023aw_finish
+.L024aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -567,11 +715,11 @@ bn_add_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L012aw_loop
-.L011aw_finish:
+	jnz	.L024aw_loop
+.L023aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -582,7 +730,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -593,7 +741,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -604,7 +752,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -615,7 +763,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -626,7 +774,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -637,7 +785,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -647,7 +795,7 @@ bn_add_words:
 	addl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L013aw_end:
+.L025aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -670,8 +818,8 @@ bn_sub_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L014aw_finish
-.L015aw_loop:
+	jz	.L026aw_finish
+.L027aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -749,11 +897,11 @@ bn_sub_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L015aw_loop
-.L014aw_finish:
+	jnz	.L027aw_loop
+.L026aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -764,7 +912,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -775,7 +923,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -786,7 +934,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -797,7 +945,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -808,7 +956,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -819,7 +967,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -829,7 +977,7 @@ bn_sub_words:
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L016aw_end:
+.L028aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -852,8 +1000,8 @@ bn_sub_part_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L017aw_finish
-.L018aw_loop:
+	jz	.L029aw_finish
+.L030aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -931,11 +1079,11 @@ bn_sub_part_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L018aw_loop
-.L017aw_finish:
+	jnz	.L030aw_loop
+.L029aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -949,7 +1097,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -963,7 +1111,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -977,7 +1125,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -991,7 +1139,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1005,7 +1153,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1019,7 +1167,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1032,20 +1180,20 @@ bn_sub_part_words:
 	addl	$4,%esi
 	addl	$4,%edi
 	addl	$4,%ebx
-.L019aw_end:
+.L031aw_end:
 	cmpl	$0,36(%esp)
-	je	.L020pw_end
+	je	.L032pw_end
 	movl	36(%esp),%ebp
 	cmpl	$0,%ebp
-	je	.L020pw_end
-	jge	.L021pw_pos
+	je	.L032pw_end
+	jge	.L033pw_pos
 
 	movl	$0,%edx
 	subl	%ebp,%edx
 	movl	%edx,%ebp
 	andl	$4294967288,%ebp
-	jz	.L022pw_neg_finish
-.L023pw_neg_loop:
+	jz	.L034pw_neg_finish
+.L035pw_neg_loop:
 
 	movl	$0,%ecx
 	movl	(%edi),%edx
@@ -1122,13 +1270,13 @@ bn_sub_part_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L023pw_neg_loop
-.L022pw_neg_finish:
+	jnz	.L035pw_neg_loop
+.L034pw_neg_finish:
 	movl	36(%esp),%edx
 	movl	$0,%ebp
 	subl	%edx,%ebp
 	andl	$7,%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	(%edi),%edx
@@ -1139,7 +1287,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	4(%edi),%edx
@@ -1150,7 +1298,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	8(%edi),%edx
@@ -1161,7 +1309,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	12(%edi),%edx
@@ -1172,7 +1320,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	16(%edi),%edx
@@ -1183,7 +1331,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	20(%edi),%edx
@@ -1194,7 +1342,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	24(%edi),%edx
@@ -1204,181 +1352,182 @@ bn_sub_part_words:
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-	jmp	.L020pw_end
-.L021pw_pos:
+	jmp	.L032pw_end
+.L033pw_pos:
 	andl	$4294967288,%ebp
-	jz	.L024pw_pos_finish
-.L025pw_pos_loop:
+	jz	.L036pw_pos_finish
+.L037pw_pos_loop:
 
 	movl	(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,(%ebx)
-	jnc	.L026pw_nc0
+	jnc	.L038pw_nc0
 
 	movl	4(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,4(%ebx)
-	jnc	.L027pw_nc1
+	jnc	.L039pw_nc1
 
 	movl	8(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,8(%ebx)
-	jnc	.L028pw_nc2
+	jnc	.L040pw_nc2
 
 	movl	12(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,12(%ebx)
-	jnc	.L029pw_nc3
+	jnc	.L041pw_nc3
 
 	movl	16(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,16(%ebx)
-	jnc	.L030pw_nc4
+	jnc	.L042pw_nc4
 
 	movl	20(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,20(%ebx)
-	jnc	.L031pw_nc5
+	jnc	.L043pw_nc5
 
 	movl	24(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,24(%ebx)
-	jnc	.L032pw_nc6
+	jnc	.L044pw_nc6
 
 	movl	28(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,28(%ebx)
-	jnc	.L033pw_nc7
+	jnc	.L045pw_nc7
 
 	addl	$32,%esi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L025pw_pos_loop
-.L024pw_pos_finish:
+	jnz	.L037pw_pos_loop
+.L036pw_pos_finish:
 	movl	36(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,(%ebx)
-	jnc	.L034pw_tail_nc0
+	jnc	.L046pw_tail_nc0
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	4(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,4(%ebx)
-	jnc	.L035pw_tail_nc1
+	jnc	.L047pw_tail_nc1
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	8(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,8(%ebx)
-	jnc	.L036pw_tail_nc2
+	jnc	.L048pw_tail_nc2
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	12(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,12(%ebx)
-	jnc	.L037pw_tail_nc3
+	jnc	.L049pw_tail_nc3
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	16(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,16(%ebx)
-	jnc	.L038pw_tail_nc4
+	jnc	.L050pw_tail_nc4
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	20(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,20(%ebx)
-	jnc	.L039pw_tail_nc5
+	jnc	.L051pw_tail_nc5
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	24(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,24(%ebx)
-	jnc	.L040pw_tail_nc6
+	jnc	.L052pw_tail_nc6
 	movl	$1,%eax
-	jmp	.L020pw_end
-.L041pw_nc_loop:
+	jmp	.L032pw_end
+.L053pw_nc_loop:
 	movl	(%esi),%ecx
 	movl	%ecx,(%ebx)
-.L026pw_nc0:
+.L038pw_nc0:
 	movl	4(%esi),%ecx
 	movl	%ecx,4(%ebx)
-.L027pw_nc1:
+.L039pw_nc1:
 	movl	8(%esi),%ecx
 	movl	%ecx,8(%ebx)
-.L028pw_nc2:
+.L040pw_nc2:
 	movl	12(%esi),%ecx
 	movl	%ecx,12(%ebx)
-.L029pw_nc3:
+.L041pw_nc3:
 	movl	16(%esi),%ecx
 	movl	%ecx,16(%ebx)
-.L030pw_nc4:
+.L042pw_nc4:
 	movl	20(%esi),%ecx
 	movl	%ecx,20(%ebx)
-.L031pw_nc5:
+.L043pw_nc5:
 	movl	24(%esi),%ecx
 	movl	%ecx,24(%ebx)
-.L032pw_nc6:
+.L044pw_nc6:
 	movl	28(%esi),%ecx
 	movl	%ecx,28(%ebx)
-.L033pw_nc7:
+.L045pw_nc7:
 
 	addl	$32,%esi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L041pw_nc_loop
+	jnz	.L053pw_nc_loop
 	movl	36(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	(%esi),%ecx
 	movl	%ecx,(%ebx)
-.L034pw_tail_nc0:
+.L046pw_tail_nc0:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	4(%esi),%ecx
 	movl	%ecx,4(%ebx)
-.L035pw_tail_nc1:
+.L047pw_tail_nc1:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	8(%esi),%ecx
 	movl	%ecx,8(%ebx)
-.L036pw_tail_nc2:
+.L048pw_tail_nc2:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	12(%esi),%ecx
 	movl	%ecx,12(%ebx)
-.L037pw_tail_nc3:
+.L049pw_tail_nc3:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	16(%esi),%ecx
 	movl	%ecx,16(%ebx)
-.L038pw_tail_nc4:
+.L050pw_tail_nc4:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	20(%esi),%ecx
 	movl	%ecx,20(%ebx)
-.L039pw_tail_nc5:
+.L051pw_tail_nc5:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	24(%esi),%ecx
 	movl	%ecx,24(%ebx)
-.L040pw_tail_nc6:
-.L042pw_nc_end:
+.L052pw_tail_nc6:
+.L054pw_nc_end:
 	movl	$0,%eax
-.L020pw_end:
+.L032pw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
 .size	bn_sub_part_words,.-.L_bn_sub_part_words_begin
+.comm	OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/bn/asm/x86-gf2m.S b/main/openssl/crypto/bn/asm/x86-gf2m.S
index 9403a5aa..9ed29ae0 100644
--- a/main/openssl/crypto/bn/asm/x86-gf2m.S
+++ b/main/openssl/crypto/bn/asm/x86-gf2m.S
@@ -249,6 +249,18 @@ bn_GF2m_mul_2x2:
 	movl	4(%edx),%edx
 	testl	$8388608,%eax
 	jz	.L001ialu
+	testl	$16777216,%eax
+	jz	.L002mmx
+	testl	$2,%edx
+	jz	.L002mmx
+	movups	8(%esp),%xmm0
+	shufps	$177,%xmm0,%xmm0
+.byte	102,15,58,68,192,1
+	movl	4(%esp),%eax
+	movups	%xmm0,(%eax)
+	ret
+.align	16
+.L002mmx:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
diff --git a/main/openssl/crypto/bn/asm/x86-mont.S b/main/openssl/crypto/bn/asm/x86-mont.S
index 2bbb0e3a..c701e9e3 100644
--- a/main/openssl/crypto/bn/asm/x86-mont.S
+++ b/main/openssl/crypto/bn/asm/x86-mont.S
@@ -42,6 +42,127 @@ bn_mul_mont:
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
 	movl	%ebp,24(%esp)
+	call	.L001PIC_me_up
+.L001PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L001PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L002non_sse2
+	movl	$-1,%eax
+	movd	%eax,%mm7
+	movl	8(%esp),%esi
+	movl	12(%esp),%edi
+	movl	16(%esp),%ebp
+	xorl	%edx,%edx
+	xorl	%ecx,%ecx
+	movd	(%edi),%mm4
+	movd	(%esi),%mm5
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	movq	%mm5,%mm2
+	movq	%mm5,%mm0
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	incl	%ecx
+.align	16
+.L0031st:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	leal	1(%ecx),%ecx
+	cmpl	%ebx,%ecx
+	jl	.L0031st
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	incl	%edx
+.L004outer:
+	xorl	%ecx,%ecx
+	movd	(%edi,%edx,4),%mm4
+	movd	(%esi),%mm5
+	movd	32(%esp),%mm6
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	paddq	%mm6,%mm5
+	movq	%mm5,%mm0
+	movq	%mm5,%mm2
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	36(%esp),%mm6
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	incl	%ecx
+	decl	%ebx
+.L005inner:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	movd	36(%esp,%ecx,4),%mm6
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	decl	%ebx
+	leal	1(%ecx),%ecx
+	jnz	.L005inner
+	movl	%ecx,%ebx
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	movd	36(%esp,%ebx,4),%mm6
+	paddq	%mm2,%mm3
+	paddq	%mm6,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	leal	1(%edx),%edx
+	cmpl	%ebx,%edx
+	jle	.L004outer
+	emms
+	jmp	.L006common_tail
+.align	16
+.L002non_sse2:
 	movl	8(%esp),%esi
 	leal	1(%ebx),%ebp
 	movl	12(%esp),%edi
@@ -52,12 +173,12 @@ bn_mul_mont:
 	leal	4(%edi,%ebx,4),%eax
 	orl	%edx,%ebp
 	movl	(%edi),%edi
-	jz	.L001bn_sqr_mont
+	jz	.L007bn_sqr_mont
 	movl	%eax,28(%esp)
 	movl	(%esi),%eax
 	xorl	%edx,%edx
 .align	16
-.L002mull:
+.L008mull:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	%eax,%ebp
@@ -66,7 +187,7 @@ bn_mul_mont:
 	movl	(%esi,%ecx,4),%eax
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L002mull
+	jl	.L008mull
 	movl	%edx,%ebp
 	mull	%edi
 	movl	20(%esp),%edi
@@ -84,9 +205,9 @@ bn_mul_mont:
 	movl	4(%esi),%eax
 	adcl	$0,%edx
 	incl	%ecx
-	jmp	.L0032ndmadd
+	jmp	.L0092ndmadd
 .align	16
-.L0041stmadd:
+.L0101stmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -97,7 +218,7 @@ bn_mul_mont:
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L0041stmadd
+	jl	.L0101stmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%eax
@@ -120,7 +241,7 @@ bn_mul_mont:
 	adcl	$0,%edx
 	movl	$1,%ecx
 .align	16
-.L0032ndmadd:
+.L0092ndmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -131,7 +252,7 @@ bn_mul_mont:
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0032ndmadd
+	jl	.L0092ndmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -147,16 +268,16 @@ bn_mul_mont:
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	28(%esp),%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L005common_tail
+	je	.L006common_tail
 	movl	(%ecx),%edi
 	movl	8(%esp),%esi
 	movl	%ecx,12(%esp)
 	xorl	%ecx,%ecx
 	xorl	%edx,%edx
 	movl	(%esi),%eax
-	jmp	.L0041stmadd
+	jmp	.L0101stmadd
 .align	16
-.L001bn_sqr_mont:
+.L007bn_sqr_mont:
 	movl	%ebx,(%esp)
 	movl	%ecx,12(%esp)
 	movl	%edi,%eax
@@ -167,7 +288,7 @@ bn_mul_mont:
 	andl	$1,%ebx
 	incl	%ecx
 .align	16
-.L006sqr:
+.L011sqr:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -179,7 +300,7 @@ bn_mul_mont:
 	cmpl	(%esp),%ecx
 	movl	%eax,%ebx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L006sqr
+	jl	.L011sqr
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -203,7 +324,7 @@ bn_mul_mont:
 	movl	4(%esi),%eax
 	movl	$1,%ecx
 .align	16
-.L0073rdmadd:
+.L0123rdmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -222,7 +343,7 @@ bn_mul_mont:
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0073rdmadd
+	jl	.L0123rdmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -238,7 +359,7 @@ bn_mul_mont:
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	%ebx,%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L005common_tail
+	je	.L006common_tail
 	movl	4(%esi,%ecx,4),%edi
 	leal	1(%ecx),%ecx
 	movl	%edi,%eax
@@ -250,12 +371,12 @@ bn_mul_mont:
 	xorl	%ebp,%ebp
 	cmpl	%ebx,%ecx
 	leal	1(%ecx),%ecx
-	je	.L008sqrlast
+	je	.L013sqrlast
 	movl	%edx,%ebx
 	shrl	$1,%edx
 	andl	$1,%ebx
 .align	16
-.L009sqradd:
+.L014sqradd:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -271,13 +392,13 @@ bn_mul_mont:
 	cmpl	(%esp),%ecx
 	movl	%ebp,28(%esp,%ecx,4)
 	movl	%eax,%ebx
-	jle	.L009sqradd
+	jle	.L014sqradd
 	movl	%edx,%ebp
 	addl	%edx,%edx
 	shrl	$31,%ebp
 	addl	%ebx,%edx
 	adcl	$0,%ebp
-.L008sqrlast:
+.L013sqrlast:
 	movl	20(%esp),%edi
 	movl	16(%esp),%esi
 	imull	32(%esp),%edi
@@ -292,9 +413,9 @@ bn_mul_mont:
 	adcl	$0,%edx
 	movl	$1,%ecx
 	movl	4(%esi),%eax
-	jmp	.L0073rdmadd
+	jmp	.L0123rdmadd
 .align	16
-.L005common_tail:
+.L006common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -302,13 +423,13 @@ bn_mul_mont:
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L010sub:
+.L015sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L010sub
+	jge	.L015sub
 	sbbl	$0,%eax
 	andl	%eax,%esi
 	notl	%eax
@@ -316,12 +437,12 @@ bn_mul_mont:
 	andl	%eax,%ebp
 	orl	%ebp,%esi
 .align	16
-.L011copy:
+.L016copy:
 	movl	(%esi,%ebx,4),%eax
 	movl	%eax,(%edi,%ebx,4)
 	movl	%ecx,32(%esp,%ebx,4)
 	decl	%ebx
-	jge	.L011copy
+	jge	.L016copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 .L000just_leave:
@@ -336,3 +457,4 @@ bn_mul_mont:
 .byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
 .byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
 .byte	111,114,103,62,0
+.comm	OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/evp/e_aes.c b/main/openssl/crypto/evp/e_aes.c
index c7869b69..41cee42d 100644
--- a/main/openssl/crypto/evp/e_aes.c
+++ b/main/openssl/crypto/evp/e_aes.c
@@ -482,6 +482,14 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
 	NULL,NULL,aes_##mode##_ctrl,NULL }; \
 const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
 { return &aes_##keylen##_##mode; }
+
+#endif
+
+#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
+#include "arm_arch.h"
+#if __ARM_ARCH__>=7
+#define BSAES_CAPABLE  (OPENSSL_armcap_P & ARMV7_NEON)
+#endif
 #endif
 
 #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
@@ -1067,11 +1075,13 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		xctx->stream = NULL;
 #endif
 		/* key_len is two AES keys */
+#if !(defined(__arm__) || defined(__arm))      /* not yet? */
 #ifdef BSAES_CAPABLE
 		if (BSAES_CAPABLE)
 			xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
 		else
 #endif
+#endif
 #ifdef VPAES_CAPABLE
 		if (VPAES_CAPABLE)
 		    {
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.S b/main/openssl/crypto/modes/asm/ghash-x86.S
index cb9ae20d..50473201 100644
--- a/main/openssl/crypto/modes/asm/ghash-x86.S
+++ b/main/openssl/crypto/modes/asm/ghash-x86.S
@@ -203,418 +203,94 @@ gcm_ghash_4bit_x86:
 	popl	%ebp
 	ret
 .size	gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.type	_mmx_gmult_4bit_inner,@function
+.globl	gcm_gmult_4bit_mmx
+.type	gcm_gmult_4bit_mmx,@function
 .align	16
-_mmx_gmult_4bit_inner:
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	call	.L005pic_point
+.L005pic_point:
+	popl	%eax
+	leal	.Lrem_4bit-.L005pic_point(%eax),%eax
+	movzbl	15(%edi),%ebx
 	xorl	%ecx,%ecx
 	movl	%ebx,%edx
 	movb	%dl,%cl
+	movl	$14,%ebp
 	shlb	$4,%cl
 	andl	$240,%edx
 	movq	8(%esi,%ecx,1),%mm0
 	movq	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	14(%edi),%cl
-	psllq	$60,%mm2
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
+	jmp	.L006mmx_loop
+.align	16
+.L006mmx_loop:
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%edx,1),%mm0
-	movb	13(%edi),%cl
+	movb	(%edi,%ebp,1),%cl
 	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
+	decl	%ebp
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	12(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
 	pxor	(%esi,%edx,1),%mm1
 	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
+	js	.L007mmx_break
 	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	11(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
 	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	10(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%ecx,1),%mm0
 	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	9(%edi),%cl
-	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
 	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	8(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
+	jmp	.L006mmx_loop
+.align	16
+.L007mmx_break:
 	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	7(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
 	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	6(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%ecx,1),%mm0
 	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	5(%edi),%cl
-	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
 	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
 	pxor	%mm2,%mm0
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	4(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	3(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	2(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%edx,1),%mm0
-	movb	1(%edi),%cl
 	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
 	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	movl	4(%eax,%ebp,8),%edi
 	psrlq	$32,%mm0
 	movd	%mm1,%edx
 	psrlq	$32,%mm1
 	movd	%mm0,%ecx
 	movd	%mm1,%ebp
-	shll	$4,%edi
 	bswap	%ebx
 	bswap	%edx
 	bswap	%ecx
-	xorl	%edi,%ebp
 	bswap	%ebp
-	ret
-.size	_mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
-.globl	gcm_gmult_4bit_mmx
-.type	gcm_gmult_4bit_mmx,@function
-.align	16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	call	.L005pic_point
-.L005pic_point:
-	popl	%eax
-	leal	.Lrem_4bit-.L005pic_point(%eax),%eax
-	movzbl	15(%edi),%ebx
-	call	_mmx_gmult_4bit_inner
-	movl	20(%esp),%edi
 	emms
 	movl	%ebx,12(%edi)
 	movl	%edx,4(%edi)
@@ -635,61 +311,926 @@ gcm_ghash_4bit_mmx:
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	movl	20(%esp),%ebp
-	movl	24(%esp),%esi
-	movl	28(%esp),%edi
-	movl	32(%esp),%ecx
-	call	.L006pic_point
-.L006pic_point:
-	popl	%eax
-	leal	.Lrem_4bit-.L006pic_point(%eax),%eax
-	addl	%edi,%ecx
-	movl	%ecx,32(%esp)
-	subl	$20,%esp
-	movl	12(%ebp),%ebx
-	movl	4(%ebp),%edx
-	movl	8(%ebp),%ecx
-	movl	(%ebp),%ebp
-	jmp	.L007mmx_outer_loop
+	movl	20(%esp),%eax
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ecx
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	call	.L008pic_point
+.L008pic_point:
+	popl	%esi
+	leal	.Lrem_8bit-.L008pic_point(%esi),%esi
+	subl	$544,%esp
+	andl	$-64,%esp
+	subl	$16,%esp
+	addl	%ecx,%edx
+	movl	%eax,544(%esp)
+	movl	%edx,552(%esp)
+	movl	%ebp,556(%esp)
+	addl	$128,%ebx
+	leal	144(%esp),%edi
+	leal	400(%esp),%ebp
+	movl	-120(%ebx),%edx
+	movq	-120(%ebx),%mm0
+	movq	-128(%ebx),%mm3
+	shll	$4,%edx
+	movb	%dl,(%esp)
+	movl	-104(%ebx),%edx
+	movq	-104(%ebx),%mm2
+	movq	-112(%ebx),%mm5
+	movq	%mm0,-128(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,(%edi)
+	movq	%mm3,%mm7
+	psrlq	$4,%mm3
+	shll	$4,%edx
+	movb	%dl,1(%esp)
+	movl	-88(%ebx),%edx
+	movq	-88(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	-96(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-120(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,8(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-128(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,(%ebp)
+	shll	$4,%edx
+	movb	%dl,2(%esp)
+	movl	-72(%ebx),%edx
+	movq	-72(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	-80(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-112(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,16(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-120(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,8(%ebp)
+	shll	$4,%edx
+	movb	%dl,3(%esp)
+	movl	-56(%ebx),%edx
+	movq	-56(%ebx),%mm2
+	psllq	$60,%mm7
+	movq	-64(%ebx),%mm5
+	por	%mm7,%mm1
+	movq	%mm0,-104(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,24(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-112(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,16(%ebp)
+	shll	$4,%edx
+	movb	%dl,4(%esp)
+	movl	-40(%ebx),%edx
+	movq	-40(%ebx),%mm1
+	psllq	$60,%mm6
+	movq	-48(%ebx),%mm4
+	por	%mm6,%mm0
+	movq	%mm2,-96(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,32(%edi)
+	movq	%mm5,%mm7
+	movq	%mm0,-104(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,24(%ebp)
+	shll	$4,%edx
+	movb	%dl,5(%esp)
+	movl	-24(%ebx),%edx
+	movq	-24(%ebx),%mm0
+	psllq	$60,%mm7
+	movq	-32(%ebx),%mm3
+	por	%mm7,%mm2
+	movq	%mm1,-88(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,40(%edi)
+	movq	%mm4,%mm6
+	movq	%mm2,-96(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,32(%ebp)
+	shll	$4,%edx
+	movb	%dl,6(%esp)
+	movl	-8(%ebx),%edx
+	movq	-8(%ebx),%mm2
+	psllq	$60,%mm6
+	movq	-16(%ebx),%mm5
+	por	%mm6,%mm1
+	movq	%mm0,-80(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,48(%edi)
+	movq	%mm3,%mm7
+	movq	%mm1,-88(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,40(%ebp)
+	shll	$4,%edx
+	movb	%dl,7(%esp)
+	movl	8(%ebx),%edx
+	movq	8(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-72(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,56(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-80(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,48(%ebp)
+	shll	$4,%edx
+	movb	%dl,8(%esp)
+	movl	24(%ebx),%edx
+	movq	24(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	16(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-64(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,64(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-72(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,56(%ebp)
+	shll	$4,%edx
+	movb	%dl,9(%esp)
+	movl	40(%ebx),%edx
+	movq	40(%ebx),%mm2
+	psllq	$60,%mm7
+	movq	32(%ebx),%mm5
+	por	%mm7,%mm1
+	movq	%mm0,-56(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,72(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-64(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,64(%ebp)
+	shll	$4,%edx
+	movb	%dl,10(%esp)
+	movl	56(%ebx),%edx
+	movq	56(%ebx),%mm1
+	psllq	$60,%mm6
+	movq	48(%ebx),%mm4
+	por	%mm6,%mm0
+	movq	%mm2,-48(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,80(%edi)
+	movq	%mm5,%mm7
+	movq	%mm0,-56(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,72(%ebp)
+	shll	$4,%edx
+	movb	%dl,11(%esp)
+	movl	72(%ebx),%edx
+	movq	72(%ebx),%mm0
+	psllq	$60,%mm7
+	movq	64(%ebx),%mm3
+	por	%mm7,%mm2
+	movq	%mm1,-40(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,88(%edi)
+	movq	%mm4,%mm6
+	movq	%mm2,-48(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,80(%ebp)
+	shll	$4,%edx
+	movb	%dl,12(%esp)
+	movl	88(%ebx),%edx
+	movq	88(%ebx),%mm2
+	psllq	$60,%mm6
+	movq	80(%ebx),%mm5
+	por	%mm6,%mm1
+	movq	%mm0,-32(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,96(%edi)
+	movq	%mm3,%mm7
+	movq	%mm1,-40(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,88(%ebp)
+	shll	$4,%edx
+	movb	%dl,13(%esp)
+	movl	104(%ebx),%edx
+	movq	104(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	96(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-24(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,104(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-32(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,96(%ebp)
+	shll	$4,%edx
+	movb	%dl,14(%esp)
+	movl	120(%ebx),%edx
+	movq	120(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	112(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-16(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,112(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-24(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,104(%ebp)
+	shll	$4,%edx
+	movb	%dl,15(%esp)
+	psllq	$60,%mm7
+	por	%mm7,%mm1
+	movq	%mm0,-8(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,120(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-16(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,112(%ebp)
+	psllq	$60,%mm6
+	por	%mm6,%mm0
+	movq	%mm0,-8(%ebp)
+	movq	%mm3,120(%ebp)
+	movq	(%eax),%mm6
+	movl	8(%eax),%ebx
+	movl	12(%eax),%edx
 .align	16
-.L007mmx_outer_loop:
-	xorl	12(%edi),%ebx
-	xorl	4(%edi),%edx
-	xorl	8(%edi),%ecx
-	xorl	(%edi),%ebp
-	movl	%edi,48(%esp)
-	movl	%ebx,12(%esp)
-	movl	%edx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%ebp,(%esp)
-	movl	%esp,%edi
-	shrl	$24,%ebx
-	call	_mmx_gmult_4bit_inner
-	movl	48(%esp),%edi
-	leal	16(%edi),%edi
-	cmpl	52(%esp),%edi
-	jb	.L007mmx_outer_loop
-	movl	40(%esp),%edi
+.L009outer:
+	xorl	12(%ecx),%edx
+	xorl	8(%ecx),%ebx
+	pxor	(%ecx),%mm6
+	leal	16(%ecx),%ecx
+	movl	%ebx,536(%esp)
+	movq	%mm6,528(%esp)
+	movl	%ecx,548(%esp)
+	xorl	%eax,%eax
+	roll	$8,%edx
+	movb	%dl,%al
+	movl	%eax,%ebp
+	andb	$15,%al
+	shrl	$4,%ebp
+	pxor	%mm0,%mm0
+	roll	$8,%edx
+	pxor	%mm1,%mm1
+	pxor	%mm2,%mm2
+	movq	16(%esp,%eax,8),%mm7
+	movq	144(%esp,%eax,8),%mm6
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	shrl	$4,%edi
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	536(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	532(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	528(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	524(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	pxor	144(%esp,%eax,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	movzbl	%bl,%ebx
+	pxor	%mm2,%mm2
+	psllq	$4,%mm1
+	movd	%mm7,%ecx
+	psrlq	$4,%mm7
+	movq	%mm6,%mm3
+	psrlq	$4,%mm6
+	shll	$4,%ecx
+	pxor	16(%esp,%edi,8),%mm7
+	psllq	$60,%mm3
+	movzbl	%cl,%ecx
+	pxor	%mm3,%mm7
+	pxor	144(%esp,%edi,8),%mm6
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	%mm1,%mm6
+	movd	%mm7,%edx
+	pinsrw	$3,(%esi,%ecx,2),%mm2
+	psllq	$12,%mm0
+	pxor	%mm0,%mm6
+	psrlq	$32,%mm7
+	pxor	%mm2,%mm6
+	movl	548(%esp),%ecx
+	movd	%mm7,%ebx
+	movq	%mm6,%mm3
+	psllw	$8,%mm6
+	psrlw	$8,%mm3
+	por	%mm3,%mm6
+	bswap	%edx
+	pshufw	$27,%mm6,%mm6
+	bswap	%ebx
+	cmpl	552(%esp),%ecx
+	jne	.L009outer
+	movl	544(%esp),%eax
+	movl	%edx,12(%eax)
+	movl	%ebx,8(%eax)
+	movq	%mm6,(%eax)
+	movl	556(%esp),%esp
 	emms
-	movl	%ebx,12(%edi)
-	movl	%edx,4(%edi)
-	movl	%ecx,8(%edi)
-	movl	%ebp,(%edi)
-	addl	$20,%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
 .size	gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	call	.L010pic
+.L010pic:
+	popl	%ecx
+	leal	.Lbswap-.L010pic(%ecx),%ecx
+	movdqu	(%eax),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+	pand	16(%ecx),%xmm5
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	movdqu	%xmm2,(%edx)
+	movdqu	%xmm0,16(%edx)
+	ret
+.size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	call	.L011pic
+.L011pic:
+	popl	%ecx
+	leal	.Lbswap-.L011pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movups	(%edx),%xmm2
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	ret
+.size	gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%eax
+	movl	24(%esp),%edx
+	movl	28(%esp),%esi
+	movl	32(%esp),%ebx
+	call	.L012pic
+.L012pic:
+	popl	%ecx
+	leal	.Lbswap-.L012pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movdqu	(%edx),%xmm2
+.byte	102,15,56,0,197
+	subl	$16,%ebx
+	jz	.L013odd_tail
+	movdqu	(%esi),%xmm3
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm6,%xmm7
+	pshufd	$78,%xmm6,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm6,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm7
+	pxor	%xmm4,%xmm6
+	movups	16(%edx),%xmm2
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	jbe	.L014even_tail
+.L015mod_loop:
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqu	(%esi),%xmm3
+	movups	(%edx),%xmm2
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm5,%xmm3
+	pshufd	$78,%xmm2,%xmm5
+	pxor	%xmm2,%xmm5
+.byte	102,15,58,68,250,17
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,68,221,0
+	movups	16(%edx),%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm3
+	movdqa	%xmm3,%xmm5
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm5
+	pxor	%xmm3,%xmm7
+	pxor	%xmm5,%xmm6
+	movdqa	(%ecx),%xmm5
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	ja	.L015mod_loop
+.L014even_tail:
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	testl	%ebx,%ebx
+	jnz	.L016done
+	movups	(%edx),%xmm2
+.L013odd_tail:
+	movdqu	(%esi),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.L016done:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align	64
+.Lbswap:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
 .align	64
 .Lrem_4bit:
-.long	0,0,0,29491200,0,58982400,0,38141952
-.long	0,117964800,0,113901568,0,76283904,0,88997888
-.long	0,235929600,0,265420800,0,227803136,0,206962688
-.long	0,152567808,0,148504576,0,177995776,0,190709760
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
 .align	64
-.L008rem_8bit:
+.Lrem_8bit:
 .value	0,450,900,582,1800,1738,1164,1358
 .value	3600,4050,3476,3158,2328,2266,2716,2910
 .value	7200,7650,8100,7782,6952,6890,6316,6510
diff --git a/main/openssl/crypto/ocsp/ocsp.h b/main/openssl/crypto/ocsp/ocsp.h
index 31e45744..f14e9f7e 100644
--- a/main/openssl/crypto/ocsp/ocsp.h
+++ b/main/openssl/crypto/ocsp/ocsp.h
@@ -90,6 +90,13 @@ extern "C" {
 #define OCSP_RESPID_KEY			0x400
 #define OCSP_NOTIME			0x800
 
+#ifdef OPENSSL_SYS_WIN32
+  /* Under Win32 these are defined in wincrypt.h */
+#undef OCSP_REQUEST
+#undef X509_NAME
+#undef OCSP_RESPONSE
+#endif
+
 /*   CertID ::= SEQUENCE {
  *       hashAlgorithm            AlgorithmIdentifier,
  *       issuerNameHash     OCTET STRING, -- Hash of Issuer's DN
diff --git a/main/openssl/crypto/sha/asm/sha1-586.S b/main/openssl/crypto/sha/asm/sha1-586.S
index e77f6541..47bef2a9 100644
--- a/main/openssl/crypto/sha/asm/sha1-586.S
+++ b/main/openssl/crypto/sha/asm/sha1-586.S
@@ -9,6 +9,21 @@ sha1_block_data_order:
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000pic_point](%ebp),%esi
+	movl	OPENSSL_ia32cap_P@GOT(%esi),%esi
+	leal	.LK_XX_XX-.L000pic_point(%ebp),%ebp
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	testl	$512,%edx
+	jz	.L001x86
+	testl	$16777216,%eax
+	jz	.L001x86
+	jmp	.Lssse3_shortcut
+.align	16
+.L001x86:
 	movl	20(%esp),%ebp
 	movl	24(%esp),%esi
 	movl	28(%esp),%eax
@@ -17,9 +32,9 @@ sha1_block_data_order:
 	addl	%esi,%eax
 	movl	%eax,104(%esp)
 	movl	16(%ebp),%edi
-	jmp	.L000loop
+	jmp	.L002loop
 .align	16
-.L000loop:
+.L002loop:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
@@ -1366,7 +1381,7 @@ sha1_block_data_order:
 	movl	%ebx,12(%ebp)
 	movl	%edx,%esi
 	movl	%ecx,16(%ebp)
-	jb	.L000loop
+	jb	.L002loop
 	addl	$76,%esp
 	popl	%edi
 	popl	%esi
@@ -1374,7 +1389,1251 @@ sha1_block_data_order:
 	popl	%ebp
 	ret
 .size	sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.type	_sha1_block_data_order_ssse3,@function
+.align	16
+_sha1_block_data_order_ssse3:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L003pic_point
+.L003pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+	movdqa	(%ebp),%xmm7
+	movdqa	16(%ebp),%xmm0
+	movdqa	32(%ebp),%xmm1
+	movdqa	48(%ebp),%xmm2
+	movdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	movdqa	%xmm0,112(%esp)
+	movdqa	%xmm1,128(%esp)
+	movdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	movdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	movdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	movdqu	-64(%ebp),%xmm0
+	movdqu	-48(%ebp),%xmm1
+	movdqu	-32(%ebp),%xmm2
+	movdqu	-16(%ebp),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	movdqa	%xmm7,96(%esp)
+.byte	102,15,56,0,222
+	paddd	%xmm7,%xmm0
+	paddd	%xmm7,%xmm1
+	paddd	%xmm7,%xmm2
+	movdqa	%xmm0,(%esp)
+	psubd	%xmm7,%xmm0
+	movdqa	%xmm1,16(%esp)
+	psubd	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	psubd	%xmm7,%xmm2
+	movdqa	%xmm1,%xmm4
+	jmp	.L004loop
+.align	16
+.L004loop:
+	addl	(%esp),%edi
+	xorl	%edx,%ecx
+.byte	102,15,58,15,224,8
+	movdqa	%xmm3,%xmm6
+	movl	%eax,%ebp
+	roll	$5,%eax
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm0,64(%esp)
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrldq	$4,%xmm6
+	xorl	%edx,%esi
+	addl	%eax,%edi
+	pxor	%xmm0,%xmm4
+	rorl	$2,%ebx
+	addl	%esi,%edi
+	pxor	%xmm2,%xmm6
+	addl	4(%esp),%edx
+	xorl	%ecx,%ebx
+	movl	%edi,%esi
+	roll	$5,%edi
+	pxor	%xmm6,%xmm4
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	%xmm7,48(%esp)
+	xorl	%ecx,%ebp
+	addl	%edi,%edx
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm6
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	8(%esp),%ecx
+	xorl	%ebx,%eax
+	pslldq	$12,%xmm0
+	paddd	%xmm4,%xmm4
+	movl	%edx,%ebp
+	roll	$5,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrld	$31,%xmm6
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm0,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	psrld	$30,%xmm0
+	por	%xmm6,%xmm4
+	addl	12(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pslld	$2,%xmm7
+	pxor	%xmm0,%xmm4
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	96(%esp),%xmm0
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	pxor	%xmm7,%xmm4
+	movdqa	%xmm2,%xmm5
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	16(%esp),%eax
+	xorl	%edi,%edx
+.byte	102,15,58,15,233,8
+	movdqa	%xmm4,%xmm7
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm1,80(%esp)
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	psrldq	$4,%xmm7
+	xorl	%edi,%esi
+	addl	%ebx,%eax
+	pxor	%xmm1,%xmm5
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm3,%xmm7
+	addl	20(%esp),%edi
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	roll	$5,%eax
+	pxor	%xmm7,%xmm5
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	%xmm0,(%esp)
+	xorl	%edx,%ebp
+	addl	%eax,%edi
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm5,%xmm7
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	24(%esp),%edx
+	xorl	%ecx,%ebx
+	pslldq	$12,%xmm1
+	paddd	%xmm5,%xmm5
+	movl	%edi,%ebp
+	roll	$5,%edi
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	psrld	$31,%xmm7
+	xorl	%ecx,%esi
+	addl	%edi,%edx
+	movdqa	%xmm1,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	psrld	$30,%xmm1
+	por	%xmm7,%xmm5
+	addl	28(%esp),%ecx
+	xorl	%ebx,%eax
+	movl	%edx,%esi
+	roll	$5,%edx
+	pslld	$2,%xmm0
+	pxor	%xmm1,%xmm5
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	movdqa	112(%esp),%xmm1
+	xorl	%ebx,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm0,%xmm5
+	movdqa	%xmm3,%xmm6
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	32(%esp),%ebx
+	xorl	%eax,%edi
+.byte	102,15,58,15,242,8
+	movdqa	%xmm5,%xmm0
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	paddd	%xmm5,%xmm1
+	movdqa	%xmm2,96(%esp)
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	psrldq	$4,%xmm0
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	pxor	%xmm2,%xmm6
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm4,%xmm0
+	addl	36(%esp),%eax
+	xorl	%edi,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	pxor	%xmm0,%xmm6
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	movdqa	%xmm1,16(%esp)
+	xorl	%edi,%ebp
+	addl	%ebx,%eax
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,%xmm0
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	40(%esp),%edi
+	xorl	%edx,%ecx
+	pslldq	$12,%xmm2
+	paddd	%xmm6,%xmm6
+	movl	%eax,%ebp
+	roll	$5,%eax
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrld	$31,%xmm0
+	xorl	%edx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm2,%xmm1
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	psrld	$30,%xmm2
+	por	%xmm0,%xmm6
+	addl	44(%esp),%edx
+	xorl	%ecx,%ebx
+	movdqa	64(%esp),%xmm0
+	movl	%edi,%esi
+	roll	$5,%edi
+	pslld	$2,%xmm1
+	pxor	%xmm2,%xmm6
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	112(%esp),%xmm2
+	xorl	%ecx,%ebp
+	addl	%edi,%edx
+	pxor	%xmm1,%xmm6
+	movdqa	%xmm4,%xmm7
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	48(%esp),%ecx
+	xorl	%ebx,%eax
+.byte	102,15,58,15,251,8
+	movdqa	%xmm6,%xmm1
+	movl	%edx,%ebp
+	roll	$5,%edx
+	paddd	%xmm6,%xmm2
+	movdqa	%xmm3,64(%esp)
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrldq	$4,%xmm1
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	pxor	%xmm3,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	pxor	%xmm5,%xmm1
+	addl	52(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pxor	%xmm1,%xmm7
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	%xmm2,32(%esp)
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movdqa	%xmm7,%xmm3
+	movdqa	%xmm7,%xmm1
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	56(%esp),%eax
+	xorl	%edi,%edx
+	pslldq	$12,%xmm3
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	psrld	$31,%xmm1
+	xorl	%edi,%esi
+	addl	%ebx,%eax
+	movdqa	%xmm3,%xmm2
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	psrld	$30,%xmm3
+	por	%xmm1,%xmm7
+	addl	60(%esp),%edi
+	xorl	%edx,%ecx
+	movdqa	80(%esp),%xmm1
+	movl	%eax,%esi
+	roll	$5,%eax
+	pslld	$2,%xmm2
+	pxor	%xmm3,%xmm7
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	112(%esp),%xmm3
+	xorl	%edx,%ebp
+	addl	%eax,%edi
+	pxor	%xmm2,%xmm7
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	movdqa	%xmm7,%xmm2
+	addl	(%esp),%edx
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,15,214,8
+	xorl	%ecx,%ebx
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,80(%esp)
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	movdqa	%xmm3,%xmm4
+	paddd	%xmm7,%xmm3
+	xorl	%ecx,%esi
+	addl	%edi,%edx
+	pxor	%xmm2,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	4(%esp),%ecx
+	xorl	%ebx,%eax
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	roll	$5,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	pslld	$2,%xmm0
+	xorl	%ebx,%ebp
+	addl	%edx,%ecx
+	psrld	$30,%xmm2
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	8(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	por	%xmm2,%xmm0
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	movdqa	96(%esp),%xmm2
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	12(%esp),%eax
+	movdqa	%xmm0,%xmm3
+	xorl	%edi,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	xorl	%edi,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	16(%esp),%edi
+	pxor	%xmm5,%xmm1
+.byte	102,15,58,15,223,8
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,96(%esp)
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm4,%xmm5
+	paddd	%xmm0,%xmm4
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	pxor	%xmm3,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pslld	$2,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm3
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	por	%xmm3,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%eax,%ebp
+	movdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	movdqa	%xmm1,%xmm4
+	addl	%ebp,%ebx
+	addl	32(%esp),%eax
+	pxor	%xmm6,%xmm2
+.byte	102,15,58,15,224,8
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,64(%esp)
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	movdqa	128(%esp),%xmm6
+	paddd	%xmm1,%xmm5
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm4,%xmm2
+	addl	36(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	pslld	$2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ecx,%esi
+	psrld	$30,%xmm4
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	por	%xmm4,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%ebx,%ebp
+	movdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	movdqa	%xmm2,%xmm5
+	addl	%ebp,%ecx
+	addl	48(%esp),%ebx
+	pxor	%xmm7,%xmm3
+.byte	102,15,58,15,233,8
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,80(%esp)
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	movdqa	%xmm6,%xmm7
+	paddd	%xmm2,%xmm6
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	pslld	$2,%xmm3
+	addl	56(%esp),%edi
+	xorl	%edx,%esi
+	psrld	$30,%xmm5
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	por	%xmm5,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ecx,%ebp
+	movdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	movdqa	%xmm3,%xmm6
+	addl	%ebp,%edx
+	addl	(%esp),%ecx
+	pxor	%xmm0,%xmm4
+.byte	102,15,58,15,242,8
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm0,96(%esp)
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm7,%xmm0
+	paddd	%xmm3,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	pxor	%xmm6,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm7,48(%esp)
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	pslld	$2,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edi,%esi
+	psrld	$30,%xmm6
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	por	%xmm6,%xmm4
+	addl	12(%esp),%edi
+	xorl	%edx,%ebp
+	movdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	movdqa	%xmm4,%xmm7
+	addl	%ebp,%edi
+	addl	16(%esp),%edx
+	pxor	%xmm1,%xmm5
+.byte	102,15,58,15,251,8
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm1,64(%esp)
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	movdqa	%xmm0,%xmm1
+	paddd	%xmm4,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	pxor	%xmm7,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm0,(%esp)
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	pslld	$2,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%eax,%esi
+	psrld	$30,%xmm7
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	por	%xmm7,%xmm5
+	addl	28(%esp),%eax
+	xorl	%edi,%ebp
+	movdqa	80(%esp),%xmm7
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	movdqa	%xmm5,%xmm0
+	addl	%ebp,%eax
+	movl	%ecx,%ebp
+	pxor	%xmm2,%xmm6
+.byte	102,15,58,15,196,8
+	xorl	%edx,%ecx
+	addl	32(%esp),%edi
+	andl	%edx,%ebp
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm2,80(%esp)
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm5,%xmm1
+	addl	%ebp,%edi
+	movl	%eax,%ebp
+	pxor	%xmm0,%xmm6
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm1,16(%esp)
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	36(%esp),%edx
+	andl	%ecx,%esi
+	pslld	$2,%xmm6
+	andl	%ebx,%ebp
+	rorl	$7,%eax
+	psrld	$30,%xmm0
+	addl	%esi,%edx
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	por	%xmm0,%xmm6
+	movl	%eax,%ebp
+	xorl	%ebx,%eax
+	movdqa	96(%esp),%xmm0
+	addl	40(%esp),%ecx
+	andl	%ebx,%ebp
+	andl	%eax,%esi
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	movdqa	%xmm6,%xmm1
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%edi,%esi
+	xorl	%eax,%edi
+	addl	44(%esp),%ebx
+	andl	%eax,%esi
+	andl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movl	%edx,%ebp
+	pxor	%xmm3,%xmm7
+.byte	102,15,58,15,205,8
+	xorl	%edi,%edx
+	addl	48(%esp),%eax
+	andl	%edi,%ebp
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm3,96(%esp)
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	144(%esp),%xmm3
+	paddd	%xmm6,%xmm2
+	addl	%ebp,%eax
+	movl	%ebx,%ebp
+	pxor	%xmm1,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	movdqa	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	52(%esp),%edi
+	andl	%edx,%esi
+	pslld	$2,%xmm7
+	andl	%ecx,%ebp
+	rorl	$7,%ebx
+	psrld	$30,%xmm1
+	addl	%esi,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	por	%xmm1,%xmm7
+	movl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	64(%esp),%xmm1
+	addl	56(%esp),%edx
+	andl	%ecx,%ebp
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	movdqa	%xmm7,%xmm2
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	60(%esp),%ecx
+	andl	%ebx,%esi
+	andl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%edi,%ebp
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,15,214,8
+	xorl	%eax,%edi
+	addl	(%esp),%ebx
+	andl	%eax,%ebp
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,64(%esp)
+	andl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm3,%xmm4
+	paddd	%xmm7,%xmm3
+	addl	%ebp,%ebx
+	movl	%ecx,%ebp
+	pxor	%xmm2,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	xorl	%edi,%edx
+	addl	4(%esp),%eax
+	andl	%edi,%esi
+	pslld	$2,%xmm0
+	andl	%edx,%ebp
+	rorl	$7,%ecx
+	psrld	$30,%xmm2
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	por	%xmm2,%xmm0
+	movl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	80(%esp),%xmm2
+	addl	8(%esp),%edi
+	andl	%edx,%ebp
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	movdqa	%xmm0,%xmm3
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	12(%esp),%edx
+	andl	%ecx,%esi
+	andl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movl	%eax,%ebp
+	pxor	%xmm5,%xmm1
+.byte	102,15,58,15,223,8
+	xorl	%ebx,%eax
+	addl	16(%esp),%ecx
+	andl	%ebx,%ebp
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,80(%esp)
+	andl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm4,%xmm5
+	paddd	%xmm0,%xmm4
+	addl	%ebp,%ecx
+	movl	%edx,%ebp
+	pxor	%xmm3,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	movl	%edi,%esi
+	xorl	%eax,%edi
+	addl	20(%esp),%ebx
+	andl	%eax,%esi
+	pslld	$2,%xmm1
+	andl	%edi,%ebp
+	rorl	$7,%edx
+	psrld	$30,%xmm3
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	por	%xmm3,%xmm1
+	movl	%edx,%ebp
+	xorl	%edi,%edx
+	movdqa	96(%esp),%xmm3
+	addl	24(%esp),%eax
+	andl	%edi,%ebp
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movdqa	%xmm1,%xmm4
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	28(%esp),%edi
+	andl	%edx,%esi
+	andl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movl	%ebx,%ebp
+	pxor	%xmm6,%xmm2
+.byte	102,15,58,15,224,8
+	xorl	%ecx,%ebx
+	addl	32(%esp),%edx
+	andl	%ecx,%ebp
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,96(%esp)
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm6
+	paddd	%xmm1,%xmm5
+	addl	%ebp,%edx
+	movl	%edi,%ebp
+	pxor	%xmm4,%xmm2
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	36(%esp),%ecx
+	andl	%ebx,%esi
+	pslld	$2,%xmm2
+	andl	%eax,%ebp
+	rorl	$7,%edi
+	psrld	$30,%xmm4
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	por	%xmm4,%xmm2
+	movl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	64(%esp),%xmm4
+	addl	40(%esp),%ebx
+	andl	%eax,%ebp
+	andl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	movdqa	%xmm2,%xmm5
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movl	%edx,%esi
+	xorl	%edi,%edx
+	addl	44(%esp),%eax
+	andl	%edi,%esi
+	andl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	addl	48(%esp),%edi
+	pxor	%xmm7,%xmm3
+.byte	102,15,58,15,233,8
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,64(%esp)
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm6,%xmm7
+	paddd	%xmm2,%xmm6
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pslld	$2,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm5
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	por	%xmm5,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	(%esp),%eax
+	paddd	%xmm3,%xmm7
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	movdqa	%xmm7,48(%esp)
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	4(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	8(%esp),%edx
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	12(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L005done
+	movdqa	160(%esp),%xmm7
+	movdqa	176(%esp),%xmm6
+	movdqu	(%ebp),%xmm0
+	movdqu	16(%ebp),%xmm1
+	movdqu	32(%ebp),%xmm2
+	movdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+.byte	102,15,56,0,198
+	movl	%ebp,196(%esp)
+	movdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%eax,%esi
+.byte	102,15,56,0,206
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	paddd	%xmm7,%xmm0
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movdqa	%xmm0,(%esp)
+	addl	20(%esp),%eax
+	xorl	%edi,%ebp
+	psubd	%xmm7,%xmm0
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	24(%esp),%edi
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	addl	28(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%esp),%ecx
+	xorl	%ebx,%esi
+.byte	102,15,56,0,214
+	movl	%edx,%ebp
+	roll	$5,%edx
+	paddd	%xmm7,%xmm1
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	movdqa	%xmm1,16(%esp)
+	addl	36(%esp),%ebx
+	xorl	%eax,%ebp
+	psubd	%xmm7,%xmm1
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	40(%esp),%eax
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	48(%esp),%edx
+	xorl	%ecx,%esi
+.byte	102,15,56,0,222
+	movl	%edi,%ebp
+	roll	$5,%edi
+	paddd	%xmm7,%xmm2
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movdqa	%xmm2,32(%esp)
+	addl	52(%esp),%ecx
+	xorl	%ebx,%ebp
+	psubd	%xmm7,%xmm2
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	56(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%esi,%ebx
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	movdqa	%xmm1,%xmm4
+	jmp	.L004loop
+.align	16
+.L005done:
+	addl	16(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	20(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	24(%esp),%edi
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	addl	28(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%esp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	addl	36(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	40(%esp),%eax
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	48(%esp),%edx
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	52(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	56(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.align	64
+.LK_XX_XX:
+.long	1518500249,1518500249,1518500249,1518500249
+.long	1859775393,1859775393,1859775393,1859775393
+.long	2400959708,2400959708,2400959708,2400959708
+.long	3395469782,3395469782,3395469782,3395469782
+.long	66051,67438087,134810123,202182159
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
 .byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
 .byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
 .byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm	OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/sha/asm/sha512-586.S b/main/openssl/crypto/sha/asm/sha512-586.S
index 4b806f35..82c76c41 100644
--- a/main/openssl/crypto/sha/asm/sha512-586.S
+++ b/main/openssl/crypto/sha/asm/sha512-586.S
@@ -25,6 +25,278 @@ sha512_block_data_order:
 	movl	%edi,4(%esp)
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx
+	movl	OPENSSL_ia32cap_P@GOT(%edx),%edx
+	btl	$26,(%edx)
+	jnc	.L002loop_x86
+	movq	(%esi),%mm0
+	movq	8(%esi),%mm1
+	movq	16(%esi),%mm2
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	subl	$80,%esp
+.align	16
+.L003loop_sse2:
+	movq	%mm1,8(%esp)
+	movq	%mm2,16(%esp)
+	movq	%mm3,24(%esp)
+	movq	%mm5,40(%esp)
+	movq	%mm6,48(%esp)
+	movq	%mm7,56(%esp)
+	movl	(%edi),%ecx
+	movl	4(%edi),%edx
+	addl	$8,%edi
+	bswap	%ecx
+	bswap	%edx
+	movl	%ecx,76(%esp)
+	movl	%edx,72(%esp)
+.align	16
+.L00400_14_sse2:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	bswap	%eax
+	bswap	%ebx
+	movl	%eax,68(%esp)
+	movl	%ebx,64(%esp)
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+	cmpb	$53,%dl
+	jne	.L00400_14_sse2
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	movq	88(%esp),%mm6
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	movq	192(%esp),%mm2
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+.align	16
+.L00516_79_sse2:
+	movq	%mm2,%mm1
+	psrlq	$1,%mm2
+	movq	%mm6,%mm7
+	psrlq	$6,%mm6
+	movq	%mm2,%mm3
+	psrlq	$6,%mm2
+	movq	%mm6,%mm5
+	psrlq	$13,%mm6
+	pxor	%mm2,%mm3
+	psrlq	$1,%mm2
+	pxor	%mm6,%mm5
+	psrlq	$42,%mm6
+	pxor	%mm2,%mm3
+	movq	200(%esp),%mm2
+	psllq	$56,%mm1
+	pxor	%mm6,%mm5
+	psllq	$3,%mm7
+	pxor	%mm1,%mm3
+	paddq	128(%esp),%mm2
+	psllq	$7,%mm1
+	pxor	%mm7,%mm5
+	psllq	$42,%mm7
+	pxor	%mm1,%mm3
+	pxor	%mm7,%mm5
+	paddq	%mm5,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,72(%esp)
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	movq	88(%esp),%mm6
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	movq	192(%esp),%mm2
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+	cmpb	$23,%dl
+	jne	.L00516_79_sse2
+	movq	8(%esp),%mm1
+	movq	16(%esp),%mm2
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	addl	$640,%esp
+	subl	$640,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L003loop_sse2
+	emms
+	movl	92(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
 .align	16
 .L002loop_x86:
 	movl	(%edi),%eax
@@ -130,7 +402,7 @@ sha512_block_data_order:
 	movl	$16,%ecx
 .long	2784229001
 .align	16
-.L00300_15_x86:
+.L00600_15_x86:
 	movl	40(%esp),%ecx
 	movl	44(%esp),%edx
 	movl	%ecx,%esi
@@ -237,9 +509,9 @@ sha512_block_data_order:
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$148,%dl
-	jne	.L00300_15_x86
+	jne	.L00600_15_x86
 .align	16
-.L00416_79_x86:
+.L00716_79_x86:
 	movl	312(%esp),%ecx
 	movl	316(%esp),%edx
 	movl	%ecx,%esi
@@ -412,7 +684,7 @@ sha512_block_data_order:
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$23,%dl
-	jne	.L00416_79_x86
+	jne	.L00716_79_x86
 	movl	840(%esp),%esi
 	movl	844(%esp),%edi
 	movl	(%esi),%eax
@@ -561,3 +833,4 @@ sha512_block_data_order:
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
+.comm	OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/x86cpuid.S b/main/openssl/crypto/x86cpuid.S
index 73b5d98e..87a46d4b 100644
--- a/main/openssl/crypto/x86cpuid.S
+++ b/main/openssl/crypto/x86cpuid.S
@@ -226,6 +226,18 @@ OPENSSL_wipe_cpu:
 	movl	(%ecx),%ecx
 	btl	$1,(%ecx)
 	jnc	.L015no_x87
+	andl	$83886080,%ecx
+	cmpl	$83886080,%ecx
+	jne	.L016no_sse2
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+.L016no_sse2:
 .long	4007259865,4007259865,4007259865,4007259865,2430851995
 .L015no_x87:
 	leal	4(%esp),%eax
@@ -241,11 +253,11 @@ OPENSSL_atomic_add:
 	pushl	%ebx
 	nop
 	movl	(%edx),%eax
-.L016spin:
+.L017spin:
 	leal	(%eax,%ecx,1),%ebx
 	nop
 .long	447811568
-	jne	.L016spin
+	jne	.L017spin
 	movl	%ebx,%eax
 	popl	%ebx
 	ret
@@ -286,32 +298,32 @@ OPENSSL_cleanse:
 	movl	8(%esp),%ecx
 	xorl	%eax,%eax
 	cmpl	$7,%ecx
-	jae	.L017lot
+	jae	.L018lot
 	cmpl	$0,%ecx
-	je	.L018ret
-.L019little:
+	je	.L019ret
+.L020little:
 	movb	%al,(%edx)
 	subl	$1,%ecx
 	leal	1(%edx),%edx
-	jnz	.L019little
-.L018ret:
+	jnz	.L020little
+.L019ret:
 	ret
 .align	16
-.L017lot:
+.L018lot:
 	testl	$3,%edx
-	jz	.L020aligned
+	jz	.L021aligned
 	movb	%al,(%edx)
 	leal	-1(%ecx),%ecx
 	leal	1(%edx),%edx
-	jmp	.L017lot
-.L020aligned:
+	jmp	.L018lot
+.L021aligned:
 	movl	%eax,(%edx)
 	leal	-4(%ecx),%ecx
 	testl	$-4,%ecx
 	leal	4(%edx),%edx
-	jnz	.L020aligned
+	jnz	.L021aligned
 	cmpl	$0,%ecx
-	jne	.L019little
+	jne	.L020little
 	ret
 .size	OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
 .globl	OPENSSL_ia32_rdrand
@@ -320,11 +332,11 @@ OPENSSL_cleanse:
 OPENSSL_ia32_rdrand:
 .L_OPENSSL_ia32_rdrand_begin:
 	movl	$8,%ecx
-.L021loop:
+.L022loop:
 .byte	15,199,240
-	jc	.L022break
-	loop	.L021loop
-.L022break:
+	jc	.L023break
+	loop	.L022loop
+.L023break:
 	cmpl	$0,%eax
 	cmovel	%ecx,%eax
 	ret
diff --git a/main/openssl/import_openssl.sh b/main/openssl/import_openssl.sh
index 3f581530..02d2ab1c 100755
--- a/main/openssl/import_openssl.sh
+++ b/main/openssl/import_openssl.sh
@@ -143,7 +143,7 @@ function gen_asm_mips () {
 function gen_asm_x86 () {
   local OUT
   OUT=$(default_asm_file "$@")
-  $PERL_EXE "$1" elf -fPIC > "$OUT"
+  $PERL_EXE "$1" elf -fPIC $(print_values_with_prefix -D $OPENSSL_CRYPTO_DEFINES_x86) > "$OUT"
 }
 
 function gen_asm_x86_64 () {
@@ -186,12 +186,12 @@ function generate_build_config_headers() {
   local configure_args_bits=CONFIGURE_ARGS_$1
   local configure_args_stat=''
   local outname=$1
-  if [ $2 -eq "1" ] ; then
+  if [[ $2 == 1 ]] ; then
       configure_args_stat=CONFIGURE_ARGS_STATIC
       outname="static-$1"
   fi
 
-  if [ $1 == "trusty" ] ; then
+  if [[ $1 == trusty ]] ; then
     PERL=/usr/bin/perl ./Configure $CONFIGURE_ARGS_TRUSTY
   else
     PERL=/usr/bin/perl ./Configure $CONFIGURE_ARGS ${!configure_args_bits} ${!configure_args_stat}
@@ -284,6 +284,17 @@ var_sorted_value() {
   uniq_sort $(var_value $1)
 }
 
+# Print the values in a list with a prefix
+# $1: prefix to use
+# $2+: values of list
+print_values_with_prefix() {
+  declare -r prefix=$1
+  shift
+  for src; do
+    echo -n " $prefix$src "
+  done
+}
+
 # Print the definition of a given variable in a GNU Make build file.
 # $1: Variable name (e.g. common_src_files)
 # $2: prefix for each variable contents
@@ -393,36 +404,28 @@ LOCAL_CFLAGS_${arch} += \$(${arch}_cflags)"
       done
     else
       echo "
-ifeq (\$(HOST_OS)-\$(HOST_ARCH),linux-x86)
-ifneq (\$(BUILD_HOST_64bit),)
-host_arch := x86_64
-else
-host_arch := x86
-endif
-else
-ifeq (\$(HOST_OS)-\$(HOST_ARCH),linux-x86_64)
-host_arch := x86_64
-else
-\$(warning Unknown host architecture \$(HOST_OS)-\$(HOST_ARCH))
-host_arch := unknown
-endif
-endif
-
-LOCAL_CFLAGS     += \$(common_cflags) \$(\$(host_arch)_cflags)
+LOCAL_CFLAGS += \$(common_cflags)
 LOCAL_C_INCLUDES += \$(common_c_includes) \$(local_c_includes)
-LOCAL_SRC_FILES  += \$(filter-out \$(\$(host_arch)_exclude_files), \$(common_src_files) \$(\$(host_arch)_src_files))"
+
+ifeq (\$(HOST_OS),linux)
+LOCAL_CFLAGS_x86 += \$(x86_cflags)
+LOCAL_SRC_FILES_x86 += \$(filter-out \$(x86_exclude_files), \$(common_src_files) \$(x86_src_files))
+LOCAL_CFLAGS_x86_64 += \$(x86_64_cflags)
+LOCAL_SRC_FILES_x86_64 += \$(filter-out \$(x86_64_exclude_files), \$(common_src_files) \$(x86_64_src_files))
+else
+\$(warning Unknown host OS \$(HOST_OS))
+LOCAL_SRC_FILES += \$(common_src_files)
+endif"
     fi
   ) > "$output"
 }
 
 function import() {
   declare -r OPENSSL_SOURCE=$1
-
   untar $OPENSSL_SOURCE readonly
   applypatches $OPENSSL_DIR
 
   cd $OPENSSL_DIR
-
   generate_build_config_mk
   generate_opensslconf_h
 
@@ -440,6 +443,7 @@ function import() {
 
   # Generate arm asm
   gen_asm_arm crypto/aes/asm/aes-armv4.pl
+  gen_asm_arm crypto/aes/asm/bsaes-armv7.pl
   gen_asm_arm crypto/bn/asm/armv4-gf2m.pl
   gen_asm_arm crypto/bn/asm/armv4-mont.pl
   gen_asm_arm crypto/modes/asm/ghash-armv4.pl
diff --git a/main/openssl/include/openssl/ocsp.h b/main/openssl/include/openssl/ocsp.h
index 31e45744..f14e9f7e 100644
--- a/main/openssl/include/openssl/ocsp.h
+++ b/main/openssl/include/openssl/ocsp.h
@@ -90,6 +90,13 @@ extern "C" {
 #define OCSP_RESPID_KEY			0x400
 #define OCSP_NOTIME			0x800
 
+#ifdef OPENSSL_SYS_WIN32
+  /* Under Win32 these are defined in wincrypt.h */
+#undef OCSP_REQUEST
+#undef X509_NAME
+#undef OCSP_RESPONSE
+#endif
+
 /*   CertID ::= SEQUENCE {
  *       hashAlgorithm            AlgorithmIdentifier,
  *       issuerNameHash     OCTET STRING, -- Hash of Issuer's DN
diff --git a/main/openssl/include/openssl/ssl.h b/main/openssl/include/openssl/ssl.h
index 40c4d9cf..57335a98 100644
--- a/main/openssl/include/openssl/ssl.h
+++ b/main/openssl/include/openssl/ssl.h
@@ -1315,6 +1315,10 @@ struct ssl_st
 #endif	/* OPENSSL_NO_KRB5 */
 
 #ifndef OPENSSL_NO_PSK
+	/* PSK identity hint is stored here only to enable setting a hint on an SSL object before an
+	 * SSL_SESSION is associated with it. Once an SSL_SESSION is associated with this SSL object,
+	 * the psk_identity_hint from the session takes precedence over this one. */
+	char *psk_identity_hint;
 	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
 		unsigned int max_identity_len, unsigned char *psk,
 		unsigned int max_psk_len);
diff --git a/main/openssl/include/openssl/tls1.h b/main/openssl/include/openssl/tls1.h
index 6283c6a7..ec8948d5 100644
--- a/main/openssl/include/openssl/tls1.h
+++ b/main/openssl/include/openssl/tls1.h
@@ -531,6 +531,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031
 #define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032
 
+/* ECDHE PSK ciphersuites from RFC 5489 */
+#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256    0x0300C037
+#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384    0x0300C038
+
 /* XXX
  * Inconsistency alert:
  * The OpenSSL names of ciphers with ephemeral DH here include the string
@@ -682,6 +686,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"
 #define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384"
 
+/* ECDHE PSK ciphersuites from RFC 5489 */
+#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256  "ECDHE-PSK-WITH-AES-128-CBC-SHA256"
+#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384  "ECDHE-PSK-WITH-AES-256-CBC-SHA384"
+
 #define TLS_CT_RSA_SIGN			1
 #define TLS_CT_DSS_SIGN			2
 #define TLS_CT_RSA_FIXED_DH		3
diff --git a/main/openssl/openssl.config b/main/openssl/openssl.config
index 82481076..8e97e9c0 100644
--- a/main/openssl/openssl.config
+++ b/main/openssl/openssl.config
@@ -107,7 +107,6 @@ crypto/Makefile \
 crypto/Makefile.save \
 crypto/aes/Makefile \
 crypto/aes/Makefile.save \
-crypto/armcap.c \
 crypto/asn1/Makefile \
 crypto/asn1/Makefile.save \
 crypto/bf/INSTALL \
@@ -288,8 +287,10 @@ NO_WINDOWS_BRAINDEATH \
 OPENSSL_CRYPTO_DEFINES_arm="\
 OPENSSL_BN_ASM_GF2m \
 OPENSSL_BN_ASM_MONT \
+OPENSSL_CPUID_OBJ \
 GHASH_ASM \
 AES_ASM \
+BSAES_ASM \
 SHA1_ASM \
 SHA256_ASM \
 SHA512_ASM \
@@ -307,10 +308,12 @@ SHA256_ASM \
 "
 
 OPENSSL_CRYPTO_DEFINES_x86="\
+OPENSSL_IA32_SSE2 \
 OPENSSL_BN_ASM_GF2m \
 OPENSSL_BN_ASM_MONT \
 OPENSSL_BN_ASM_PART_WORDS \
 AES_ASM \
+VPAES_ASM \
 GHASH_ASM \
 SHA1_ASM \
 SHA256_ASM \
@@ -325,7 +328,10 @@ OPENSSL_CPUID_OBJ \
 OPENSSL_CRYPTO_DEFINES_x86_64="\
 OPENSSL_BN_ASM_GF2m \
 OPENSSL_BN_ASM_MONT \
+OPENSSL_BN_ASM_MONT5 \
 AES_ASM \
+VPAES_ASM \
+BSAES_ASM \
 GHASH_ASM \
 SHA1_ASM \
 SHA256_ASM \
@@ -867,6 +873,9 @@ crypto/x509v3/v3err.c \
 
 OPENSSL_CRYPTO_SOURCES_arm="\
 crypto/aes/asm/aes-armv4.S \
+crypto/aes/asm/bsaes-armv7.S \
+crypto/armcap.c \
+crypto/armv4cpuid.S \
 crypto/bn/asm/armv4-gf2m.S \
 crypto/bn/asm/armv4-mont.S \
 crypto/modes/asm/ghash-armv4.S \
@@ -877,6 +886,7 @@ crypto/sha/asm/sha512-armv4.S \
 
 OPENSSL_CRYPTO_SOURCES_EXCLUDES_arm="\
 crypto/aes/aes_core.c \
+crypto/mem_clr.c \
 "
 
 OPENSSL_CRYPTO_SOURCES_arm64="\
@@ -1083,6 +1093,11 @@ tls12_digests.patch \
 alpn.patch \
 cbc_record_splitting.patch \
 dsa_nonce.patch \
+ecdhe_psk.patch \
+wincrypt.patch \
+tls_psk_hint.patch \
+arm_asm.patch \
+psk_client_callback_128_byte_id_bug.patch \
 "
 
 source ./openssl.trusty.config
diff --git a/main/openssl/patches/README b/main/openssl/patches/README
index 4159a85c..2ff69282 100644
--- a/main/openssl/patches/README
+++ b/main/openssl/patches/README
@@ -48,3 +48,17 @@ dsa_nonce.patch
 Adds an option to mix in hash of message and private key into (EC)DSA nonces to
 make (EC)DSA more resilient to weaknesses in RNGs used for nonces. The feature
 is disabled by default.
+
+ecdhe_psk.patch
+
+Adds support for ECDHE Pre-Shared Key (PSK) TLS cipher suites.
+
+tls_psk_hint.patch
+
+Fixes issues with TLS-PSK identity hint implementation where
+per-connection/session and per-context hints were being mixed up.
+
+psk_client_callback_128_byte_id_bug.patch
+
+Fixes the issue where it was impossible to return a 128 byte long PSK identity
+(the maximum supported length) from psk_client_callback.
diff --git a/main/openssl/rules.mk b/main/openssl/rules.mk
index c0c13e10..252dbbb3 100644
--- a/main/openssl/rules.mk
+++ b/main/openssl/rules.mk
@@ -1,7 +1,18 @@
 LOCAL_DIR := $(GET_LOCAL_DIR)
 
 MODULE := $(LOCAL_DIR)
-MODULE_USER := true
+
+TARGET_ARCH := $(ARCH)
+TARGET_2ND_ARCH := $(ARCH)
+
+# Reset local variables
+LOCAL_CFLAGS :=
+LOCAL_C_INCLUDES :=
+LOCAL_SRC_FILES_$(TARGET_ARCH) :=
+LOCAL_SRC_FILES_$(TARGET_2ND_ARCH) :=
+LOCAL_CFLAGS_$(TARGET_ARCH) :=
+LOCAL_CFLAGS_$(TARGET_2ND_ARCH) :=
+LOCAL_ADDITIONAL_DEPENDENCIES :=
 
 # get openssl_cflags
 MODULE_SRCDEPS += $(LOCAL_DIR)/build-config-trusty.mk
@@ -9,16 +20,15 @@ include $(LOCAL_DIR)/build-config-trusty.mk
 
 # get target_c_flags, target_c_includes, target_src_files
 MODULE_SRCDEPS += $(LOCAL_DIR)/Crypto-config-trusty.mk
-TARGET_ARCH := $(ARCH)
 include $(LOCAL_DIR)/Crypto-config-trusty.mk
 
-MODULE_SRCS += $(addprefix $(LOCAL_DIR)/,$(LOCAL_SRC_FILES_arm))
+MODULE_SRCS += $(addprefix $(LOCAL_DIR)/,$(LOCAL_SRC_FILES_$(ARCH)))
 
 MODULE_CFLAGS += $(LOCAL_CFLAGS)
 MODULE_CFLAGS += -Wno-error=implicit-function-declaration
 
 # Global for other modules which include openssl headers
-GLOBAL_CFLAGS += -DOPENSSL_SYS_TRUSTY
+GLOBAL_DEFINES += OPENSSL_SYS_TRUSTY
 
 LOCAL_C_INCLUDES := $(patsubst external/openssl/%,%,$(LOCAL_C_INCLUDES))
 GLOBAL_INCLUDES += $(addprefix $(LOCAL_DIR)/,$(LOCAL_C_INCLUDES))
diff --git a/main/openssl/ssl/d1_clnt.c b/main/openssl/ssl/d1_clnt.c
index 4fc4e1b9..5ee8f58e 100644
--- a/main/openssl/ssl/d1_clnt.c
+++ b/main/openssl/ssl/d1_clnt.c
@@ -1440,7 +1440,7 @@ int dtls1_send_client_key_exchange(SSL *s)
 				goto err;
 				}
 
-			psk_len = s->psk_client_callback(s, s->ctx->psk_identity_hint,
+			psk_len = s->psk_client_callback(s, s->session->psk_identity_hint,
 				identity, PSK_MAX_IDENTITY_LEN,
 				psk_or_pre_ms, sizeof(psk_or_pre_ms));
 			if (psk_len > PSK_MAX_PSK_LEN)
@@ -1465,17 +1465,6 @@ int dtls1_send_client_key_exchange(SSL *s)
 			t+=psk_len;
 			s2n(psk_len, t);
 
-			if (s->session->psk_identity_hint != NULL)
-				OPENSSL_free(s->session->psk_identity_hint);
-			s->session->psk_identity_hint = BUF_strdup(s->ctx->psk_identity_hint);
-			if (s->ctx->psk_identity_hint != NULL &&
-				s->session->psk_identity_hint == NULL)
-				{
-				SSLerr(SSL_F_DTLS1_SEND_CLIENT_KEY_EXCHANGE,
-					ERR_R_MALLOC_FAILURE);
-				goto psk_err;
-				}
-
 			if (s->session->psk_identity != NULL)
 				OPENSSL_free(s->session->psk_identity);
 			s->session->psk_identity = BUF_strdup(identity);
diff --git a/main/openssl/ssl/d1_srvr.c b/main/openssl/ssl/d1_srvr.c
index 9975e208..09f47627 100644
--- a/main/openssl/ssl/d1_srvr.c
+++ b/main/openssl/ssl/d1_srvr.c
@@ -471,7 +471,7 @@ int dtls1_accept(SSL *s)
 			/* PSK: send ServerKeyExchange if PSK identity
 			 * hint if provided */
 #ifndef OPENSSL_NO_PSK
-			    || ((alg_k & SSL_kPSK) && s->ctx->psk_identity_hint)
+			    || ((alg_k & SSL_kPSK) && s->session->psk_identity_hint)
 #endif
 			    || (alg_k & (SSL_kEDH|SSL_kDHr|SSL_kDHd))
 			    || (alg_k & SSL_kEECDH)
@@ -1288,7 +1288,7 @@ int dtls1_send_server_key_exchange(SSL *s)
 			if (type & SSL_kPSK)
 				{
 				/* reserve size for record length and PSK identity hint*/
-				n+=2+strlen(s->ctx->psk_identity_hint);
+				n+=2+strlen(s->session->psk_identity_hint);
 				}
 			else
 #endif /* !OPENSSL_NO_PSK */
@@ -1364,9 +1364,9 @@ int dtls1_send_server_key_exchange(SSL *s)
 		if (type & SSL_kPSK)
 			{
 			/* copy PSK identity hint */
-			s2n(strlen(s->ctx->psk_identity_hint), p); 
-			strncpy((char *)p, s->ctx->psk_identity_hint, strlen(s->ctx->psk_identity_hint));
-			p+=strlen(s->ctx->psk_identity_hint);
+			s2n(strlen(s->session->psk_identity_hint), p);
+			strncpy((char *)p, s->session->psk_identity_hint, strlen(s->session->psk_identity_hint));
+			p+=strlen(s->session->psk_identity_hint);
 			}
 #endif
 
diff --git a/main/openssl/ssl/s3_clnt.c b/main/openssl/ssl/s3_clnt.c
index f71470a3..b65b12d9 100644
--- a/main/openssl/ssl/s3_clnt.c
+++ b/main/openssl/ssl/s3_clnt.c
@@ -345,9 +345,10 @@ int ssl3_connect(SSL *s)
 				}
 #endif
 			/* Check if it is anon DH/ECDH */
-			/* or PSK */
+			/* or non-RSA PSK */
 			if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL) &&
-			    !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
+			    !((s->s3->tmp.new_cipher->algorithm_auth & SSL_aPSK) &&
+			      !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kRSA)))
 				{
 				ret=ssl3_get_server_certificate(s);
 				if (ret <= 0) goto end;
@@ -1377,12 +1378,14 @@ int ssl3_get_key_exchange(SSL *s)
 		   omitted if no identity hint is sent. Set
 		   session->sess_cert anyway to avoid problems
 		   later.*/
-		if (s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)
+		if (s->s3->tmp.new_cipher->algorithm_auth & SSL_aPSK)
 			{
 			s->session->sess_cert=ssl_sess_cert_new();
-			if (s->ctx->psk_identity_hint)
-				OPENSSL_free(s->ctx->psk_identity_hint);
-			s->ctx->psk_identity_hint = NULL;
+			if (s->session->psk_identity_hint)
+				{
+				OPENSSL_free(s->session->psk_identity_hint);
+				s->session->psk_identity_hint = NULL;
+				}
 			}
 #endif
 		s->s3->tmp.reuse_message=1;
@@ -1425,52 +1428,58 @@ int ssl3_get_key_exchange(SSL *s)
 	EVP_MD_CTX_init(&md_ctx);
 
 #ifndef OPENSSL_NO_PSK
-	if (alg_k & SSL_kPSK)
+	if (alg_a & SSL_aPSK)
 		{
 		char tmp_id_hint[PSK_MAX_IDENTITY_LEN+1];
 
 		al=SSL_AD_HANDSHAKE_FAILURE;
 		n2s(p,i);
 		param_len=i+2;
-		/* Store PSK identity hint for later use, hint is used
-		 * in ssl3_send_client_key_exchange.  Assume that the
-		 * maximum length of a PSK identity hint can be as
-		 * long as the maximum length of a PSK identity. */
-		if (i > PSK_MAX_IDENTITY_LEN)
-			{
-			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,
-				SSL_R_DATA_LENGTH_TOO_LONG);
-			goto f_err;
+		if (s->session->psk_identity_hint)
+			{
+			OPENSSL_free(s->session->psk_identity_hint);
+			s->session->psk_identity_hint = NULL;
 			}
-		if (param_len > n)
+		if (i != 0)
 			{
-			al=SSL_AD_DECODE_ERROR;
-			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,
-				SSL_R_BAD_PSK_IDENTITY_HINT_LENGTH);
-			goto f_err;
+			/* Store PSK identity hint for later use, hint is used
+			 * in ssl3_send_client_key_exchange.  Assume that the
+			 * maximum length of a PSK identity hint can be as
+			 * long as the maximum length of a PSK identity. */
+			if (i > PSK_MAX_IDENTITY_LEN)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,
+					SSL_R_DATA_LENGTH_TOO_LONG);
+				goto f_err;
+				}
+			if (param_len > n)
+				{
+				al=SSL_AD_DECODE_ERROR;
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,
+					SSL_R_BAD_PSK_IDENTITY_HINT_LENGTH);
+				goto f_err;
+				}
+			/* If received PSK identity hint contains NULL
+			 * characters, the hint is truncated from the first
+			 * NULL. p may not be ending with NULL, so create a
+			 * NULL-terminated string. */
+			memcpy(tmp_id_hint, p, i);
+			memset(tmp_id_hint+i, 0, PSK_MAX_IDENTITY_LEN+1-i);
+			s->session->psk_identity_hint = BUF_strdup(tmp_id_hint);
+			if (s->session->psk_identity_hint == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE, ERR_R_MALLOC_FAILURE);
+				goto f_err;
+				}
 			}
-		/* If received PSK identity hint contains NULL
-		 * characters, the hint is truncated from the first
-		 * NULL. p may not be ending with NULL, so create a
-		 * NULL-terminated string. */
-		memcpy(tmp_id_hint, p, i);
-		memset(tmp_id_hint+i, 0, PSK_MAX_IDENTITY_LEN+1-i);
-		if (s->ctx->psk_identity_hint != NULL)
-			OPENSSL_free(s->ctx->psk_identity_hint);
-		s->ctx->psk_identity_hint = BUF_strdup(tmp_id_hint);
-		if (s->ctx->psk_identity_hint == NULL)
-			{
-			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE, ERR_R_MALLOC_FAILURE);
-			goto f_err;
-			}	   
-
 		p+=i;
 		n-=param_len;
 		}
-	else
 #endif /* !OPENSSL_NO_PSK */
+
+	if (0) {}
 #ifndef OPENSSL_NO_SRP
-	if (alg_k & SSL_kSRP)
+	else if (alg_k & SSL_kSRP)
 		{
 		n2s(p,i);
 		param_len=i+2;
@@ -1547,10 +1556,9 @@ int ssl3_get_key_exchange(SSL *s)
 			pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_DSA_SIGN].x509);
 #endif
 		}
-	else
 #endif /* !OPENSSL_NO_SRP */
 #ifndef OPENSSL_NO_RSA
-	if (alg_k & SSL_kRSA)
+	else if (alg_k & SSL_kRSA)
 		{
 		if ((rsa=RSA_new()) == NULL)
 			{
@@ -1599,9 +1607,6 @@ int ssl3_get_key_exchange(SSL *s)
 		s->session->sess_cert->peer_rsa_tmp=rsa;
 		rsa=NULL;
 		}
-#else /* OPENSSL_NO_RSA */
-	if (0)
-		;
 #endif
 #ifndef OPENSSL_NO_DH
 	else if (alg_k & SSL_kEDH)
@@ -1782,14 +1787,14 @@ int ssl3_get_key_exchange(SSL *s)
 		EC_POINT_free(srvr_ecpoint);
 		srvr_ecpoint = NULL;
 		}
-	else if (alg_k)
+#endif /* !OPENSSL_NO_ECDH */
+
+	else if (!(alg_k & SSL_kPSK))
 		{
 		al=SSL_AD_UNEXPECTED_MESSAGE;
 		SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_UNEXPECTED_MESSAGE);
 		goto f_err;
 		}
-#endif /* !OPENSSL_NO_ECDH */
-
 
 	/* p points to the next byte, there are 'n' bytes left */
 
@@ -1894,8 +1899,9 @@ fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md));
 		}
 	else
 		{
-		if (!(alg_a & SSL_aNULL) && !(alg_k & SSL_kPSK))
-			/* aNULL or kPSK do not need public keys */
+		if (!(alg_a & SSL_aNULL) &&
+			/* Among PSK ciphers only RSA_PSK needs a public key */
+			!((alg_a & SSL_aPSK) && !(alg_k & SSL_kRSA)))
 			{
 			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
 			goto err;
@@ -2297,6 +2303,7 @@ int ssl3_send_client_key_exchange(SSL *s)
 	unsigned char *p,*d;
 	int n;
 	unsigned long alg_k;
+	unsigned long alg_a;
 #ifndef OPENSSL_NO_RSA
 	unsigned char *q;
 	EVP_PKEY *pkey=NULL;
@@ -2311,7 +2318,11 @@ int ssl3_send_client_key_exchange(SSL *s)
 	unsigned char *encodedPoint = NULL;
 	int encoded_pt_len = 0;
 	BN_CTX * bn_ctx = NULL;
-#endif
+#ifndef OPENSSL_NO_PSK
+	unsigned int psk_len = 0;
+	unsigned char psk[PSK_MAX_PSK_LEN];
+#endif /* OPENSSL_NO_PSK */
+#endif /* OPENSSL_NO_ECDH */
 
 	if (s->state == SSL3_ST_CW_KEY_EXCH_A)
 		{
@@ -2319,7 +2330,89 @@ int ssl3_send_client_key_exchange(SSL *s)
 		p= &(d[4]);
 
 		alg_k=s->s3->tmp.new_cipher->algorithm_mkey;
+		alg_a=s->s3->tmp.new_cipher->algorithm_auth;
+
+#ifndef OPENSSL_NO_PSK
+		if (alg_a & SSL_aPSK)
+			{
+			char identity[PSK_MAX_IDENTITY_LEN + 1];
+			size_t identity_len;
+			unsigned char *t = NULL;
+			unsigned char pre_ms[PSK_MAX_PSK_LEN*2+4];
+			unsigned int pre_ms_len = 0;
+			int psk_err = 1;
+
+			n = 0;
+			if (s->psk_client_callback == NULL)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					SSL_R_PSK_NO_CLIENT_CB);
+				goto err;
+				}
+
+			memset(identity, 0, sizeof(identity));
+			psk_len = s->psk_client_callback(s, s->session->psk_identity_hint,
+				identity, sizeof(identity), psk, sizeof(psk));
+			if (psk_len > PSK_MAX_PSK_LEN)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					ERR_R_INTERNAL_ERROR);
+				goto psk_err;
+				}
+			else if (psk_len == 0)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					SSL_R_PSK_IDENTITY_NOT_FOUND);
+				goto psk_err;
+				}
+			identity_len = strnlen(identity, sizeof(identity));
+			if (identity_len > PSK_MAX_IDENTITY_LEN)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					ERR_R_INTERNAL_ERROR);
+				goto psk_err;
+				}
 
+			if (!(alg_k & SSL_kEECDH))
+				{
+				/* Create the shared secret now if we're not using ECDHE-PSK.*/
+				pre_ms_len = 2+psk_len+2+psk_len;
+				t = pre_ms;
+				s2n(psk_len, t);
+				memset(t, 0, psk_len);
+				t+=psk_len;
+				s2n(psk_len, t);
+				memcpy(t, psk, psk_len);
+
+				s->session->master_key_length =
+					s->method->ssl3_enc->generate_master_secret(s,
+						s->session->master_key,
+						pre_ms, pre_ms_len);
+				s2n(identity_len, p);
+				memcpy(p, identity, identity_len);
+				n = 2 + identity_len;
+				}
+
+			if (s->session->psk_identity != NULL)
+				OPENSSL_free(s->session->psk_identity);
+			s->session->psk_identity = BUF_strdup(identity);
+			if (s->session->psk_identity == NULL)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					ERR_R_MALLOC_FAILURE);
+				goto psk_err;
+				}
+			psk_err = 0;
+		psk_err:
+			OPENSSL_cleanse(identity, PSK_MAX_IDENTITY_LEN);
+			OPENSSL_cleanse(pre_ms, sizeof(pre_ms));
+			if (psk_err != 0)
+				{
+				ssl3_send_alert(s, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
+				goto err;
+				}
+			}
+#endif
 		/* Fool emacs indentation */
 		if (0) {}
 #ifndef OPENSSL_NO_RSA
@@ -2580,14 +2673,19 @@ int ssl3_send_client_key_exchange(SSL *s)
 			/* perhaps clean things up a bit EAY EAY EAY EAY*/
 			}
 #endif
-
-#ifndef OPENSSL_NO_ECDH 
+#ifndef OPENSSL_NO_ECDH
 		else if (alg_k & (SSL_kEECDH|SSL_kECDHr|SSL_kECDHe))
 			{
 			const EC_GROUP *srvr_group = NULL;
 			EC_KEY *tkey;
 			int ecdh_clnt_cert = 0;
 			int field_size = 0;
+#ifndef OPENSSL_NO_PSK
+			unsigned char *pre_ms;
+			unsigned char *t;
+			unsigned int pre_ms_len;
+			unsigned int i;
+#endif
 
 			/* Did we send out the client's
 			 * ECDH share for use in premaster
@@ -2708,15 +2806,41 @@ int ssl3_send_client_key_exchange(SSL *s)
 				goto err;
 				}
 
-			/* generate master key from the result */
-			s->session->master_key_length = s->method->ssl3_enc \
-			    -> generate_master_secret(s, 
-				s->session->master_key,
-				p, n);
-
+#ifndef OPENSSL_NO_PSK
+			/* ECDHE PSK ciphersuites from RFC 5489 */
+			if ((alg_a & SSL_aPSK) && psk_len != 0)
+				{
+				pre_ms_len = 2+psk_len+2+n;
+				pre_ms = OPENSSL_malloc(pre_ms_len);
+				if (pre_ms == NULL)
+					{
+					SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+			    		ERR_R_MALLOC_FAILURE);
+					goto err;
+					}
+				memset(pre_ms, 0, pre_ms_len);
+				t = pre_ms;
+				s2n(psk_len, t);
+				memcpy(t, psk, psk_len);
+				t += psk_len;
+				s2n(n, t);
+				memcpy(t, p, n);
+				s->session->master_key_length = s->method->ssl3_enc \
+					-> generate_master_secret(s,
+						s->session->master_key, pre_ms, pre_ms_len);
+				OPENSSL_cleanse(pre_ms, pre_ms_len);
+				OPENSSL_free(pre_ms);
+				}
+#endif /* OPENSSL_NO_PSK */
+			if (!(alg_a & SSL_aPSK))
+				{
+				/* generate master key from the result */
+				s->session->master_key_length = s->method->ssl3_enc \
+					-> generate_master_secret(s,
+						s->session->master_key, p, n);
+				}
 			memset(p, 0, n); /* clean up */
-
-			if (ecdh_clnt_cert) 
+			if (ecdh_clnt_cert)
 				{
 				/* Send empty client key exch message */
 				n = 0;
@@ -2744,29 +2868,42 @@ int ssl3_send_client_key_exchange(SSL *s)
 					}
 
 				/* Encode the public key */
-				n = EC_POINT_point2oct(srvr_group, 
-				    EC_KEY_get0_public_key(clnt_ecdh), 
-				    POINT_CONVERSION_UNCOMPRESSED, 
+				encoded_pt_len = EC_POINT_point2oct(srvr_group,
+				    EC_KEY_get0_public_key(clnt_ecdh),
+				    POINT_CONVERSION_UNCOMPRESSED,
 				    encodedPoint, encoded_pt_len, bn_ctx);
+				
+				n = 0;
+#ifndef OPENSSL_NO_PSK
+				if ((alg_a & SSL_aPSK) && psk_len != 0)
+					{
+					i = strlen(s->session->psk_identity);
+					s2n(i, p);
+					memcpy(p, s->session->psk_identity, i);
+					p += i;
+					n = i + 2;
+					}
+#endif
 
-				*p = n; /* length of encoded point */
+				*p = encoded_pt_len; /* length of encoded point */
 				/* Encoded point will be copied here */
-				p += 1; 
+				p += 1;
+				n += 1;
 				/* copy the point */
-				memcpy((unsigned char *)p, encodedPoint, n);
+				memcpy((unsigned char *)p, encodedPoint, encoded_pt_len);
 				/* increment n to account for length field */
-				n += 1; 
+				n += encoded_pt_len;
 				}
 
 			/* Free allocated memory */
 			BN_CTX_free(bn_ctx);
 			if (encodedPoint != NULL) OPENSSL_free(encodedPoint);
-			if (clnt_ecdh != NULL) 
+			if (clnt_ecdh != NULL)
 				 EC_KEY_free(clnt_ecdh);
 			EVP_PKEY_free(srvr_pub_pkey);
 			}
 #endif /* !OPENSSL_NO_ECDH */
-		else if (alg_k & SSL_kGOST) 
+		else if (alg_k & SSL_kGOST)
 			{
 			/* GOST key exchange message creation */
 			EVP_PKEY_CTX *pkey_ctx;
@@ -2889,89 +3026,7 @@ int ssl3_send_client_key_exchange(SSL *s)
 				}
 			}
 #endif
-#ifndef OPENSSL_NO_PSK
-		else if (alg_k & SSL_kPSK)
-			{
-			char identity[PSK_MAX_IDENTITY_LEN];
-			unsigned char *t = NULL;
-			unsigned char psk_or_pre_ms[PSK_MAX_PSK_LEN*2+4];
-			unsigned int pre_ms_len = 0, psk_len = 0;
-			int psk_err = 1;
-
-			n = 0;
-			if (s->psk_client_callback == NULL)
-				{
-				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
-					SSL_R_PSK_NO_CLIENT_CB);
-				goto err;
-				}
-
-			psk_len = s->psk_client_callback(s, s->ctx->psk_identity_hint,
-				identity, PSK_MAX_IDENTITY_LEN,
-				psk_or_pre_ms, sizeof(psk_or_pre_ms));
-			if (psk_len > PSK_MAX_PSK_LEN)
-				{
-				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
-					ERR_R_INTERNAL_ERROR);
-				goto psk_err;
-				}
-			else if (psk_len == 0)
-				{
-				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
-					SSL_R_PSK_IDENTITY_NOT_FOUND);
-				goto psk_err;
-				}
-
-			/* create PSK pre_master_secret */
-			pre_ms_len = 2+psk_len+2+psk_len;
-			t = psk_or_pre_ms;
-			memmove(psk_or_pre_ms+psk_len+4, psk_or_pre_ms, psk_len);
-			s2n(psk_len, t);
-			memset(t, 0, psk_len);
-			t+=psk_len;
-			s2n(psk_len, t);
-
-			if (s->session->psk_identity_hint != NULL)
-				OPENSSL_free(s->session->psk_identity_hint);
-			s->session->psk_identity_hint = BUF_strdup(s->ctx->psk_identity_hint);
-			if (s->ctx->psk_identity_hint != NULL &&
-				s->session->psk_identity_hint == NULL)
-				{
-				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
-					ERR_R_MALLOC_FAILURE);
-				goto psk_err;
-				}
-
-			if (s->session->psk_identity != NULL)
-				OPENSSL_free(s->session->psk_identity);
-			s->session->psk_identity = BUF_strdup(identity);
-			if (s->session->psk_identity == NULL)
-				{
-				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
-					ERR_R_MALLOC_FAILURE);
-				goto psk_err;
-				}
-
-			s->session->master_key_length =
-				s->method->ssl3_enc->generate_master_secret(s,
-					s->session->master_key,
-					psk_or_pre_ms, pre_ms_len); 
-			n = strlen(identity);
-			s2n(n, p);
-			memcpy(p, identity, n);
-			n+=2;
-			psk_err = 0;
-		psk_err:
-			OPENSSL_cleanse(identity, PSK_MAX_IDENTITY_LEN);
-			OPENSSL_cleanse(psk_or_pre_ms, sizeof(psk_or_pre_ms));
-			if (psk_err != 0)
-				{
-				ssl3_send_alert(s, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
-				goto err;
-				}
-			}
-#endif
-		else
+		else if (!(alg_k & SSL_kPSK))
 			{
 			ssl3_send_alert(s, SSL3_AL_FATAL,
 			    SSL_AD_HANDSHAKE_FAILURE);
@@ -3276,7 +3331,7 @@ int ssl3_check_cert_and_algorithm(SSL *s)
 	alg_a=s->s3->tmp.new_cipher->algorithm_auth;
 
 	/* we don't have a certificate */
-	if ((alg_a & (SSL_aDH|SSL_aNULL|SSL_aKRB5)) || (alg_k & SSL_kPSK))
+	if ((alg_a & (SSL_aDH|SSL_aNULL|SSL_aKRB5)) || ((alg_a & SSL_aPSK) && !(alg_k & SSL_kRSA)))
 		return(1);
 
 	sc=s->session->sess_cert;
diff --git a/main/openssl/ssl/s3_lib.c b/main/openssl/ssl/s3_lib.c
index f7a5c6f0..f84da7f5 100644
--- a/main/openssl/ssl/s3_lib.c
+++ b/main/openssl/ssl/s3_lib.c
@@ -2826,6 +2826,42 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	256,
 	},
 
+#ifndef OPENSSL_NO_PSK
+    /* ECDH PSK ciphersuites from RFC 5489 */
+
+	/* Cipher C037 */
+	{
+	1,
+	TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256,
+	TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256,
+	SSL_kEECDH,
+	SSL_aPSK,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C038 */
+	{
+	1,
+	TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384,
+	TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384,
+	SSL_kEECDH,
+	SSL_aPSK,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+#endif /* OPENSSL_NO_PSK */
+
 #endif /* OPENSSL_NO_ECDH */
 
 
@@ -3911,7 +3947,7 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
 #endif /* OPENSSL_NO_KRB5 */
 #ifndef OPENSSL_NO_PSK
 		/* with PSK there must be server callback set */
-		if ((alg_k & SSL_kPSK) && s->psk_server_callback == NULL)
+		if ((alg_a & SSL_aPSK) && s->psk_server_callback == NULL)
 			continue;
 #endif /* OPENSSL_NO_PSK */
 
diff --git a/main/openssl/ssl/s3_srvr.c b/main/openssl/ssl/s3_srvr.c
index 8692f149..0ee781f1 100644
--- a/main/openssl/ssl/s3_srvr.c
+++ b/main/openssl/ssl/s3_srvr.c
@@ -217,6 +217,7 @@ int ssl3_accept(SSL *s)
 	{
 	BUF_MEM *buf;
 	unsigned long alg_k,Time=(unsigned long)time(NULL);
+	unsigned long alg_a;
 	void (*cb)(const SSL *ssl,int type,int val)=NULL;
 	int ret= -1;
 	int new_state,state,skip=0;
@@ -412,9 +413,11 @@ int ssl3_accept(SSL *s)
 		case SSL3_ST_SW_CERT_A:
 		case SSL3_ST_SW_CERT_B:
 			/* Check if it is anon DH or anon ECDH, */
-			/* normal PSK or KRB5 or SRP */
+			/* non-RSA PSK or KRB5 or SRP */
 			if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
-				&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)
+				/* Among PSK ciphersuites only RSA_PSK uses server certificate */
+				&& !(s->s3->tmp.new_cipher->algorithm_auth & SSL_aPSK &&
+					 !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kRSA))
 				&& !(s->s3->tmp.new_cipher->algorithm_auth & SSL_aKRB5))
 				{
 				ret=ssl3_send_server_certificate(s);
@@ -443,6 +446,7 @@ int ssl3_accept(SSL *s)
 		case SSL3_ST_SW_KEY_EXCH_A:
 		case SSL3_ST_SW_KEY_EXCH_B:
 			alg_k = s->s3->tmp.new_cipher->algorithm_mkey;
+			alg_a = s->s3->tmp.new_cipher->algorithm_auth;
 
 			/* clear this, it may get reset by
 			 * send_server_key_exchange */
@@ -472,10 +476,12 @@ int ssl3_accept(SSL *s)
 			 * public key for key exchange.
 			 */
 			if (s->s3->tmp.use_rsa_tmp
-			/* PSK: send ServerKeyExchange if PSK identity
-			 * hint if provided */
+			/* PSK: send ServerKeyExchange if either:
+			 *   - PSK identity hint is provided, or
+			 *   - the key exchange is kEECDH.
+			 */
 #ifndef OPENSSL_NO_PSK
-			    || ((alg_k & SSL_kPSK) && s->ctx->psk_identity_hint)
+			    || ((alg_a & SSL_aPSK) && ((alg_k & SSL_kEECDH) || s->session->psk_identity_hint))
 #endif
 #ifndef OPENSSL_NO_SRP
 			    /* SRP: send ServerKeyExchange */
@@ -1588,12 +1594,17 @@ int ssl3_send_server_key_exchange(SSL *s)
 	int encodedlen = 0;
 	int curve_id = 0;
 	BN_CTX *bn_ctx = NULL; 
+#endif
+#ifndef OPENSSL_NO_PSK
+	const char* psk_identity_hint;
+	size_t psk_identity_hint_len;
 #endif
 	EVP_PKEY *pkey;
 	const EVP_MD *md = NULL;
 	unsigned char *p,*d;
 	int al,i;
-	unsigned long type;
+	unsigned long alg_k;
+	unsigned long alg_a;
 	int n;
 	CERT *cert;
 	BIGNUM *r[4];
@@ -1604,15 +1615,28 @@ int ssl3_send_server_key_exchange(SSL *s)
 	EVP_MD_CTX_init(&md_ctx);
 	if (s->state == SSL3_ST_SW_KEY_EXCH_A)
 		{
-		type=s->s3->tmp.new_cipher->algorithm_mkey;
+		alg_k=s->s3->tmp.new_cipher->algorithm_mkey;
+		alg_a=s->s3->tmp.new_cipher->algorithm_auth;
 		cert=s->cert;
 
 		buf=s->init_buf;
 
 		r[0]=r[1]=r[2]=r[3]=NULL;
 		n=0;
+#ifndef OPENSSL_NO_PSK
+		if (alg_a & SSL_aPSK)
+			{
+			/* size for PSK identity hint */
+			psk_identity_hint = s->session->psk_identity_hint;
+			if (psk_identity_hint)
+				psk_identity_hint_len = strlen(psk_identity_hint);
+			else
+				psk_identity_hint_len = 0;
+			n+=2+psk_identity_hint_len;
+			}
+#endif /* !OPENSSL_NO_PSK */
 #ifndef OPENSSL_NO_RSA
-		if (type & SSL_kRSA)
+		if (alg_k & SSL_kRSA)
 			{
 			rsa=cert->rsa_tmp;
 			if ((rsa == NULL) && (s->cert->rsa_tmp_cb != NULL))
@@ -1639,10 +1663,9 @@ int ssl3_send_server_key_exchange(SSL *s)
 			r[1]=rsa->e;
 			s->s3->tmp.use_rsa_tmp=1;
 			}
-		else
 #endif
 #ifndef OPENSSL_NO_DH
-			if (type & SSL_kEDH)
+		else if (alg_k & SSL_kEDH)
 			{
 			dhp=cert->dh_tmp;
 			if ((dhp == NULL) && (s->cert->dh_tmp_cb != NULL))
@@ -1695,10 +1718,9 @@ int ssl3_send_server_key_exchange(SSL *s)
 			r[1]=dh->g;
 			r[2]=dh->pub_key;
 			}
-		else 
 #endif
 #ifndef OPENSSL_NO_ECDH
-			if (type & SSL_kEECDH)
+		else if (alg_k & SSL_kEECDH)
 			{
 			const EC_GROUP *group;
 
@@ -1811,7 +1833,7 @@ int ssl3_send_server_key_exchange(SSL *s)
 			 * to encode the entire ServerECDHParams
 			 * structure. 
 			 */
-			n = 4 + encodedlen;
+			n += 4 + encodedlen;
 
 			/* We'll generate the serverKeyExchange message
 			 * explicitly so we can set these to NULLs
@@ -1821,18 +1843,9 @@ int ssl3_send_server_key_exchange(SSL *s)
 			r[2]=NULL;
 			r[3]=NULL;
 			}
-		else 
 #endif /* !OPENSSL_NO_ECDH */
-#ifndef OPENSSL_NO_PSK
-			if (type & SSL_kPSK)
-				{
-				/* reserve size for record length and PSK identity hint*/
-				n+=2+strlen(s->ctx->psk_identity_hint);
-				}
-			else
-#endif /* !OPENSSL_NO_PSK */
 #ifndef OPENSSL_NO_SRP
-		if (type & SSL_kSRP)
+		else if (alg_k & SSL_kSRP)
 			{
 			if ((s->srp_ctx.N == NULL) ||
 				(s->srp_ctx.g == NULL) ||
@@ -1847,8 +1860,8 @@ int ssl3_send_server_key_exchange(SSL *s)
 			r[2]=s->srp_ctx.s;
 			r[3]=s->srp_ctx.B;
 			}
-		else 
 #endif
+		else if (!(alg_k & SSL_kPSK))
 			{
 			al=SSL_AD_HANDSHAKE_FAILURE;
 			SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE);
@@ -1858,15 +1871,16 @@ int ssl3_send_server_key_exchange(SSL *s)
 			{
 			nr[i]=BN_num_bytes(r[i]);
 #ifndef OPENSSL_NO_SRP
-			if ((i == 2) && (type & SSL_kSRP))
+			if ((i == 2) && (alg_k & SSL_kSRP))
 				n+=1+nr[i];
 			else
 #endif
 			n+=2+nr[i];
 			}
 
-		if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
-			&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
+		if (!(alg_a & SSL_aNULL)
+			/* Among PSK ciphersuites only RSA uses a certificate */
+			&& !((alg_a & SSL_aPSK) && !(alg_k & SSL_kRSA)))
 			{
 			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher,&md))
 				== NULL)
@@ -1893,7 +1907,7 @@ int ssl3_send_server_key_exchange(SSL *s)
 		for (i=0; i < 4 && r[i] != NULL; i++)
 			{
 #ifndef OPENSSL_NO_SRP
-			if ((i == 2) && (type & SSL_kSRP))
+			if ((i == 2) && (alg_k & SSL_kSRP))
 				{
 				*p = nr[i];
 				p++;
@@ -1905,8 +1919,24 @@ int ssl3_send_server_key_exchange(SSL *s)
 			p+=nr[i];
 			}
 
+/* Note: ECDHE PSK ciphersuites use SSL_kEECDH and SSL_aPSK.
+ * When one of them is used, the server key exchange record needs to have both
+ * the psk_identity_hint and the ServerECDHParams. */
+#ifndef OPENSSL_NO_PSK
+		if (alg_a & SSL_aPSK)
+			{
+			/* copy PSK identity hint (if provided) */
+			s2n(psk_identity_hint_len, p);
+			if (psk_identity_hint_len > 0)
+				{
+				memcpy(p, psk_identity_hint, psk_identity_hint_len);
+				p+=psk_identity_hint_len;
+				}
+			}
+#endif /* OPENSSL_NO_PSK */
+
 #ifndef OPENSSL_NO_ECDH
-		if (type & SSL_kEECDH) 
+		if (alg_k & SSL_kEECDH)
 			{
 			/* XXX: For now, we only support named (not generic) curves.
 			 * In this situation, the serverKeyExchange message has:
@@ -1929,17 +1959,7 @@ int ssl3_send_server_key_exchange(SSL *s)
 			encodedPoint = NULL;
 			p += encodedlen;
 			}
-#endif
-
-#ifndef OPENSSL_NO_PSK
-		if (type & SSL_kPSK)
-			{
-			/* copy PSK identity hint */
-			s2n(strlen(s->ctx->psk_identity_hint), p); 
-			strncpy((char *)p, s->ctx->psk_identity_hint, strlen(s->ctx->psk_identity_hint));
-			p+=strlen(s->ctx->psk_identity_hint);
-			}
-#endif
+#endif /* OPENSSL_NO_ECDH */
 
 		/* not anonymous */
 		if (pkey != NULL)
@@ -1976,7 +1996,7 @@ int ssl3_send_server_key_exchange(SSL *s)
 				n+=u+2;
 				}
 			else
-#endif
+#endif /* OPENSSL_NO_RSA */
 			if (md)
 				{
 				/* For TLS1.2 and later send signature
@@ -2145,6 +2165,7 @@ int ssl3_get_client_key_exchange(SSL *s)
 	int i,al,ok;
 	long n;
 	unsigned long alg_k;
+	unsigned long alg_a;
 	unsigned char *p;
 #ifndef OPENSSL_NO_RSA
 	RSA *rsa=NULL;
@@ -2162,7 +2183,11 @@ int ssl3_get_client_key_exchange(SSL *s)
 	EC_KEY *srvr_ecdh = NULL;
 	EVP_PKEY *clnt_pub_pkey = NULL;
 	EC_POINT *clnt_ecpoint = NULL;
-	BN_CTX *bn_ctx = NULL; 
+	BN_CTX *bn_ctx = NULL;
+#ifndef OPENSSL_NO_PSK
+	unsigned int psk_len = 0;
+	unsigned char psk[PSK_MAX_PSK_LEN];
+#endif /* OPENSSL_NO_PSK */
 #endif
 
 	n=s->method->ssl_get_message(s,
@@ -2176,7 +2201,95 @@ int ssl3_get_client_key_exchange(SSL *s)
 	p=(unsigned char *)s->init_msg;
 
 	alg_k=s->s3->tmp.new_cipher->algorithm_mkey;
+	alg_a=s->s3->tmp.new_cipher->algorithm_auth;
 
+#ifndef OPENSSL_NO_PSK
+	if (alg_a & SSL_aPSK)
+		{
+		unsigned char *t = NULL;
+		unsigned char pre_ms[PSK_MAX_PSK_LEN*2+4];
+		unsigned int pre_ms_len = 0;
+		int psk_err = 1;
+		char tmp_id[PSK_MAX_IDENTITY_LEN+1];
+
+		al=SSL_AD_HANDSHAKE_FAILURE;
+
+		n2s(p, i);
+		if (n != i+2 && !(alg_k & SSL_kEECDH))
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+				SSL_R_LENGTH_MISMATCH);
+			goto psk_err;
+			}
+		if (i > PSK_MAX_IDENTITY_LEN)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+				SSL_R_DATA_LENGTH_TOO_LONG);
+			goto psk_err;
+			}
+		if (s->psk_server_callback == NULL)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+			       SSL_R_PSK_NO_SERVER_CB);
+			goto psk_err;
+			}
+
+		/* Create guaranteed NUL-terminated identity
+		 * string for the callback */
+		memcpy(tmp_id, p, i);
+		memset(tmp_id+i, 0, PSK_MAX_IDENTITY_LEN+1-i);
+		psk_len = s->psk_server_callback(s, tmp_id, psk, sizeof(psk));
+
+		if (psk_len > PSK_MAX_PSK_LEN)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+				ERR_R_INTERNAL_ERROR);
+			goto psk_err;
+			}
+		else if (psk_len == 0)
+			{
+			/* PSK related to the given identity not found */
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+			       SSL_R_PSK_IDENTITY_NOT_FOUND);
+			al=SSL_AD_UNKNOWN_PSK_IDENTITY;
+			goto psk_err;
+			}
+		if (!(alg_k & SSL_kEECDH))
+			{
+			/* Create the shared secret now if we're not using ECDHE-PSK.*/
+			pre_ms_len=2+psk_len+2+psk_len;
+			t = pre_ms;
+			s2n(psk_len, t);
+			memset(t, 0, psk_len);
+			t+=psk_len;
+			s2n(psk_len, t);
+			memcpy(t, psk, psk_len);
+
+			s->session->master_key_length=
+				s->method->ssl3_enc->generate_master_secret(s,
+					s->session->master_key, pre_ms, pre_ms_len);
+			}
+		if (s->session->psk_identity != NULL)
+			OPENSSL_free(s->session->psk_identity);
+		s->session->psk_identity = BUF_strdup(tmp_id);
+		OPENSSL_cleanse(tmp_id, PSK_MAX_IDENTITY_LEN+1);
+		if (s->session->psk_identity == NULL)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+				ERR_R_MALLOC_FAILURE);
+			goto psk_err;
+			}
+
+		p += i;
+		n -= (i + 2);
+		psk_err = 0;
+	psk_err:
+		OPENSSL_cleanse(pre_ms, sizeof(pre_ms));
+		if (psk_err != 0)
+			goto f_err;
+		}
+#endif /* OPENSSL_NO_PSK */
+	if (0) {}
 #ifndef OPENSSL_NO_RSA
 	if (alg_k & SSL_kRSA)
 		{
@@ -2281,10 +2394,9 @@ int ssl3_get_client_key_exchange(SSL *s)
 				p,i);
 		OPENSSL_cleanse(p,i);
 		}
-	else
 #endif
 #ifndef OPENSSL_NO_DH
-		if (alg_k & (SSL_kEDH|SSL_kDHr|SSL_kDHd))
+	else if (alg_k & (SSL_kEDH|SSL_kDHr|SSL_kDHd))
 		{
 		n2s(p,i);
 		if (n != i+2)
@@ -2345,10 +2457,9 @@ int ssl3_get_client_key_exchange(SSL *s)
 				s->session->master_key,p,i);
 		OPENSSL_cleanse(p,i);
 		}
-	else
 #endif
 #ifndef OPENSSL_NO_KRB5
-	if (alg_k & SSL_kKRB5)
+	else if (alg_k & SSL_kKRB5)
 		{
 		krb5_error_code		krb5rc;
 		krb5_data		enc_ticket;
@@ -2537,17 +2648,20 @@ int ssl3_get_client_key_exchange(SSL *s)
 		**  if (s->kssl_ctx)  s->kssl_ctx = NULL;
 		*/
 		}
-	else
 #endif	/* OPENSSL_NO_KRB5 */
-
 #ifndef OPENSSL_NO_ECDH
-		if (alg_k & (SSL_kEECDH|SSL_kECDHr|SSL_kECDHe))
+	else if (alg_k & (SSL_kEECDH|SSL_kECDHr|SSL_kECDHe))
 		{
 		int ret = 1;
 		int field_size = 0;
 		const EC_KEY   *tkey;
 		const EC_GROUP *group;
 		const BIGNUM *priv_key;
+#ifndef OPENSSL_NO_PSK
+		unsigned char *pre_ms;
+		unsigned int pre_ms_len;
+		unsigned char *t;
+#endif /* OPENSSL_NO_PSK */
 
 		/* initialize structures for server's ECDH key pair */
 		if ((srvr_ecdh = EC_KEY_new()) == NULL) 
@@ -2643,7 +2757,7 @@ int ssl3_get_client_key_exchange(SSL *s)
 				}
 
 			/* Get encoded point length */
-			i = *p; 
+			i = *p;
 			p += 1;
 			if (n != 1 + i)
 				{
@@ -2685,223 +2799,155 @@ int ssl3_get_client_key_exchange(SSL *s)
 		EC_KEY_free(srvr_ecdh);
 		BN_CTX_free(bn_ctx);
 		EC_KEY_free(s->s3->tmp.ecdh);
-		s->s3->tmp.ecdh = NULL; 
+		s->s3->tmp.ecdh = NULL;
 
-		/* Compute the master secret */
-		s->session->master_key_length = s->method->ssl3_enc-> \
-		    generate_master_secret(s, s->session->master_key, p, i);
-		
-		OPENSSL_cleanse(p, i);
-		return (ret);
-		}
-	else
-#endif
 #ifndef OPENSSL_NO_PSK
-		if (alg_k & SSL_kPSK)
+		/* ECDHE PSK ciphersuites from RFC 5489 */
+	    if ((alg_a & SSL_aPSK) && psk_len != 0)
 			{
-			unsigned char *t = NULL;
-			unsigned char psk_or_pre_ms[PSK_MAX_PSK_LEN*2+4];
-			unsigned int pre_ms_len = 0, psk_len = 0;
-			int psk_err = 1;
-			char tmp_id[PSK_MAX_IDENTITY_LEN+1];
-
-			al=SSL_AD_HANDSHAKE_FAILURE;
-
-			n2s(p,i);
-			if (n != i+2)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-					SSL_R_LENGTH_MISMATCH);
-				goto psk_err;
-				}
-			if (i > PSK_MAX_IDENTITY_LEN)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-					SSL_R_DATA_LENGTH_TOO_LONG);
-				goto psk_err;
-				}
-			if (s->psk_server_callback == NULL)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-				       SSL_R_PSK_NO_SERVER_CB);
-				goto psk_err;
-				}
-
-			/* Create guaranteed NULL-terminated identity
-			 * string for the callback */
-			memcpy(tmp_id, p, i);
-			memset(tmp_id+i, 0, PSK_MAX_IDENTITY_LEN+1-i);
-			psk_len = s->psk_server_callback(s, tmp_id,
-				psk_or_pre_ms, sizeof(psk_or_pre_ms));
-			OPENSSL_cleanse(tmp_id, PSK_MAX_IDENTITY_LEN+1);
-
-			if (psk_len > PSK_MAX_PSK_LEN)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-					ERR_R_INTERNAL_ERROR);
-				goto psk_err;
-				}
-			else if (psk_len == 0)
-				{
-				/* PSK related to the given identity not found */
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-				       SSL_R_PSK_IDENTITY_NOT_FOUND);
-				al=SSL_AD_UNKNOWN_PSK_IDENTITY;
-				goto psk_err;
-				}
-
-			/* create PSK pre_master_secret */
-			pre_ms_len=2+psk_len+2+psk_len;
-			t = psk_or_pre_ms;
-			memmove(psk_or_pre_ms+psk_len+4, psk_or_pre_ms, psk_len);
-			s2n(psk_len, t);
-			memset(t, 0, psk_len);
-			t+=psk_len;
-			s2n(psk_len, t);
-
-			if (s->session->psk_identity != NULL)
-				OPENSSL_free(s->session->psk_identity);
-			s->session->psk_identity = BUF_strdup((char *)p);
-			if (s->session->psk_identity == NULL)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-					ERR_R_MALLOC_FAILURE);
-				goto psk_err;
-				}
-
-			if (s->session->psk_identity_hint != NULL)
-				OPENSSL_free(s->session->psk_identity_hint);
-			s->session->psk_identity_hint = BUF_strdup(s->ctx->psk_identity_hint);
-			if (s->ctx->psk_identity_hint != NULL &&
-				s->session->psk_identity_hint == NULL)
+			pre_ms_len = 2+psk_len+2+i;
+			pre_ms = OPENSSL_malloc(pre_ms_len);
+			if (pre_ms == NULL)
 				{
 				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
 					ERR_R_MALLOC_FAILURE);
-				goto psk_err;
+				goto err;
 				}
-
-			s->session->master_key_length=
-				s->method->ssl3_enc->generate_master_secret(s,
-					s->session->master_key, psk_or_pre_ms, pre_ms_len);
-			psk_err = 0;
-		psk_err:
-			OPENSSL_cleanse(psk_or_pre_ms, sizeof(psk_or_pre_ms));
-			if (psk_err != 0)
-				goto f_err;
+			memset(pre_ms, 0, pre_ms_len);
+			t = pre_ms;
+			s2n(psk_len, t);
+			memcpy(t, psk, psk_len);
+			t += psk_len;
+			s2n(i, t);
+			memcpy(t, p, i);
+			s->session->master_key_length = s->method->ssl3_enc \
+				-> generate_master_secret(s,
+					s->session->master_key, pre_ms, pre_ms_len);
+			OPENSSL_cleanse(pre_ms, pre_ms_len);
+			OPENSSL_free(pre_ms);
 			}
-		else
+#endif /* OPENSSL_NO_PSK */
+		if (!(alg_a & SSL_aPSK))
+			{
+			/* Compute the master secret */
+			s->session->master_key_length = s->method->ssl3_enc \
+				-> generate_master_secret(s,
+					s->session->master_key, p, i);
+			}
+
+		OPENSSL_cleanse(p, i);
+		}
 #endif
 #ifndef OPENSSL_NO_SRP
-		if (alg_k & SSL_kSRP)
+	else if (alg_k & SSL_kSRP)
+		{
+		int param_len;
+
+		n2s(p,i);
+		param_len=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_BAD_SRP_A_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.A=BN_bin2bn(p,i,NULL)))
 			{
-			int param_len;
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		if (s->session->srp_username != NULL)
+			OPENSSL_free(s->session->srp_username);
+		s->session->srp_username = BUF_strdup(s->srp_ctx.login);
+		if (s->session->srp_username == NULL)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+				ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
 
-			n2s(p,i);
-			param_len=i+2;
-			if (param_len > n)
-				{
-				al=SSL_AD_DECODE_ERROR;
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_BAD_SRP_A_LENGTH);
-				goto f_err;
-				}
-			if (!(s->srp_ctx.A=BN_bin2bn(p,i,NULL)))
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_BN_LIB);
-				goto err;
-				}
-			if (s->session->srp_username != NULL)
-				OPENSSL_free(s->session->srp_username);
-			s->session->srp_username = BUF_strdup(s->srp_ctx.login);
-			if (s->session->srp_username == NULL)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
-					ERR_R_MALLOC_FAILURE);
-				goto err;
-				}
+		if ((s->session->master_key_length = SRP_generate_server_master_secret(s,s->session->master_key))<0)
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
 
-			if ((s->session->master_key_length = SRP_generate_server_master_secret(s,s->session->master_key))<0)
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
-				goto err;
-				}
+		p+=i;
+		}
+#endif	/* OPENSSL_NO_SRP */
+	else if (alg_k & SSL_kGOST) 
+		{
+		int ret = 0;
+		EVP_PKEY_CTX *pkey_ctx;
+		EVP_PKEY *client_pub_pkey = NULL, *pk = NULL;
+		unsigned char premaster_secret[32], *start;
+		size_t outlen=32, inlen;
+		unsigned long alg_a;
+
+		/* Get our certificate private key*/
+		alg_a = s->s3->tmp.new_cipher->algorithm_auth;
+		if (alg_a & SSL_aGOST94)
+			pk = s->cert->pkeys[SSL_PKEY_GOST94].privatekey;
+		else if (alg_a & SSL_aGOST01)
+			pk = s->cert->pkeys[SSL_PKEY_GOST01].privatekey;
 
-			p+=i;
+		pkey_ctx = EVP_PKEY_CTX_new(pk,NULL);
+		EVP_PKEY_decrypt_init(pkey_ctx);
+		/* If client certificate is present and is of the same type, maybe
+		 * use it for key exchange.  Don't mind errors from
+		 * EVP_PKEY_derive_set_peer, because it is completely valid to use
+		 * a client certificate for authorization only. */
+		client_pub_pkey = X509_get_pubkey(s->session->peer);
+		if (client_pub_pkey)
+			{
+			if (EVP_PKEY_derive_set_peer(pkey_ctx, client_pub_pkey) <= 0)
+				ERR_clear_error();
+			}
+		/* Decrypt session key */
+		if ((*p!=( V_ASN1_SEQUENCE| V_ASN1_CONSTRUCTED))) 
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
+			goto gerr;
+			}
+		if (p[1] == 0x81)
+			{
+			start = p+3;
+			inlen = p[2];
+			}
+		else if (p[1] < 0x80)
+			{
+			start = p+2;
+			inlen = p[1];
 			}
 		else
-#endif	/* OPENSSL_NO_SRP */
-		if (alg_k & SSL_kGOST) 
 			{
-			int ret = 0;
-			EVP_PKEY_CTX *pkey_ctx;
-			EVP_PKEY *client_pub_pkey = NULL, *pk = NULL;
-			unsigned char premaster_secret[32], *start;
-			size_t outlen=32, inlen;
-			unsigned long alg_a;
-
-			/* Get our certificate private key*/
-			alg_a = s->s3->tmp.new_cipher->algorithm_auth;
-			if (alg_a & SSL_aGOST94)
-				pk = s->cert->pkeys[SSL_PKEY_GOST94].privatekey;
-			else if (alg_a & SSL_aGOST01)
-				pk = s->cert->pkeys[SSL_PKEY_GOST01].privatekey;
-
-			pkey_ctx = EVP_PKEY_CTX_new(pk,NULL);
-			EVP_PKEY_decrypt_init(pkey_ctx);
-			/* If client certificate is present and is of the same type, maybe
-			 * use it for key exchange.  Don't mind errors from
-			 * EVP_PKEY_derive_set_peer, because it is completely valid to use
-			 * a client certificate for authorization only. */
-			client_pub_pkey = X509_get_pubkey(s->session->peer);
-			if (client_pub_pkey)
-				{
-				if (EVP_PKEY_derive_set_peer(pkey_ctx, client_pub_pkey) <= 0)
-					ERR_clear_error();
-				}
-			/* Decrypt session key */
-			if ((*p!=( V_ASN1_SEQUENCE| V_ASN1_CONSTRUCTED))) 
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
-				goto gerr;
-				}
-			if (p[1] == 0x81)
-				{
-				start = p+3;
-				inlen = p[2];
-				}
-			else if (p[1] < 0x80)
-				{
-				start = p+2;
-				inlen = p[1];
-				}
-			else
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
-				goto gerr;
-				}
-			if (EVP_PKEY_decrypt(pkey_ctx,premaster_secret,&outlen,start,inlen) <=0) 
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
+			goto gerr;
+			}
+		if (EVP_PKEY_decrypt(pkey_ctx,premaster_secret,&outlen,start,inlen) <=0) 
 
-				{
-				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
-				goto gerr;
-				}
-			/* Generate master secret */
-			s->session->master_key_length=
-				s->method->ssl3_enc->generate_master_secret(s,
-					s->session->master_key,premaster_secret,32);
-			/* Check if pubkey from client certificate was used */
-			if (EVP_PKEY_CTX_ctrl(pkey_ctx, -1, -1, EVP_PKEY_CTRL_PEER_KEY, 2, NULL) > 0)
-				ret = 2;
-			else
-				ret = 1;
-		gerr:
-			EVP_PKEY_free(client_pub_pkey);
-			EVP_PKEY_CTX_free(pkey_ctx);
-			if (ret)
-				return ret;
-			else
-				goto err;
+			{
+			SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_DECRYPTION_FAILED);
+			goto gerr;
 			}
+		/* Generate master secret */
+		s->session->master_key_length=
+			s->method->ssl3_enc->generate_master_secret(s,
+				s->session->master_key,premaster_secret,32);
+		/* Check if pubkey from client certificate was used */
+		if (EVP_PKEY_CTX_ctrl(pkey_ctx, -1, -1, EVP_PKEY_CTRL_PEER_KEY, 2, NULL) > 0)
+			ret = 2;
+		else
+			ret = 1;
+	gerr:
+		EVP_PKEY_free(client_pub_pkey);
+		EVP_PKEY_CTX_free(pkey_ctx);
+		if (ret)
+			return ret;
 		else
+			goto err;
+		}
+	else if (!(alg_k & SSL_kPSK))
 		{
 		al=SSL_AD_HANDSHAKE_FAILURE;
 		SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
diff --git a/main/openssl/ssl/ssl.h b/main/openssl/ssl/ssl.h
index 40c4d9cf..57335a98 100644
--- a/main/openssl/ssl/ssl.h
+++ b/main/openssl/ssl/ssl.h
@@ -1315,6 +1315,10 @@ struct ssl_st
 #endif	/* OPENSSL_NO_KRB5 */
 
 #ifndef OPENSSL_NO_PSK
+	/* PSK identity hint is stored here only to enable setting a hint on an SSL object before an
+	 * SSL_SESSION is associated with it. Once an SSL_SESSION is associated with this SSL object,
+	 * the psk_identity_hint from the session takes precedence over this one. */
+	char *psk_identity_hint;
 	unsigned int (*psk_client_callback)(SSL *ssl, const char *hint, char *identity,
 		unsigned int max_identity_len, unsigned char *psk,
 		unsigned int max_psk_len);
diff --git a/main/openssl/ssl/ssl_lib.c b/main/openssl/ssl/ssl_lib.c
index 74523862..8d2c3a76 100644
--- a/main/openssl/ssl/ssl_lib.c
+++ b/main/openssl/ssl/ssl_lib.c
@@ -388,6 +388,13 @@ SSL *SSL_new(SSL_CTX *ctx)
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_SSL, s, &s->ex_data);
 
 #ifndef OPENSSL_NO_PSK
+	s->psk_identity_hint = NULL;
+	if (ctx->psk_identity_hint)
+		{
+		s->psk_identity_hint = BUF_strdup(ctx->psk_identity_hint);
+		if (s->psk_identity_hint == NULL)
+			goto err;
+		}
 	s->psk_client_callback=ctx->psk_client_callback;
 	s->psk_server_callback=ctx->psk_server_callback;
 #endif
@@ -596,6 +603,11 @@ void SSL_free(SSL *s)
 		OPENSSL_free(s->alpn_client_proto_list);
 #endif
 
+#ifndef OPENSSL_NO_PSK
+	if (s->psk_identity_hint)
+		OPENSSL_free(s->psk_identity_hint);
+#endif
+
 	if (s->client_CA != NULL)
 		sk_X509_NAME_pop_free(s->client_CA,X509_NAME_free);
 
@@ -1440,7 +1452,7 @@ int ssl_cipher_list_to_bytes(SSL *s,STACK_OF(SSL_CIPHER) *sk,unsigned char *p,
 #endif /* OPENSSL_NO_KRB5 */
 #ifndef OPENSSL_NO_PSK
 		/* with PSK there must be client callback set */
-		if (((c->algorithm_mkey & SSL_kPSK) || (c->algorithm_auth & SSL_aPSK)) &&
+		if ((c->algorithm_auth & SSL_aPSK) &&
 		    s->psk_client_callback == NULL)
 			continue;
 #endif /* OPENSSL_NO_PSK */
@@ -3303,32 +3315,54 @@ int SSL_use_psk_identity_hint(SSL *s, const char *identity_hint)
 	if (s == NULL)
 		return 0;
 
-	if (s->session == NULL)
-		return 1; /* session not created yet, ignored */
-
 	if (identity_hint != NULL && strlen(identity_hint) > PSK_MAX_IDENTITY_LEN)
 		{
 		SSLerr(SSL_F_SSL_USE_PSK_IDENTITY_HINT, SSL_R_DATA_LENGTH_TOO_LONG);
 		return 0;
 		}
-	if (s->session->psk_identity_hint != NULL)
+
+	/* Clear hint in SSL and associated SSL_SESSION (if any). */
+	if (s->psk_identity_hint != NULL)
+		{
+		OPENSSL_free(s->psk_identity_hint);
+		s->psk_identity_hint = NULL;
+		}
+	if (s->session != NULL && s->session->psk_identity_hint != NULL)
+		{
 		OPENSSL_free(s->session->psk_identity_hint);
+		s->session->psk_identity_hint = NULL;
+		}
+
 	if (identity_hint != NULL)
 		{
-		s->session->psk_identity_hint = BUF_strdup(identity_hint);
-		if (s->session->psk_identity_hint == NULL)
-			return 0;
+		/* The hint is stored in SSL and SSL_SESSION with the one in
+		 * SSL_SESSION taking precedence. Thus, if SSL_SESSION is avaiable,
+		 * we store the hint there, otherwise we store it in SSL. */
+		if (s->session != NULL)
+			{
+			s->session->psk_identity_hint = BUF_strdup(identity_hint);
+			if (s->session->psk_identity_hint == NULL)
+				return 0;
+			}
+		else
+			{
+			s->psk_identity_hint = BUF_strdup(identity_hint);
+			if (s->psk_identity_hint == NULL)
+				return 0;
+			}
 		}
-	else
-		s->session->psk_identity_hint = NULL;
 	return 1;
 	}
 
 const char *SSL_get_psk_identity_hint(const SSL *s)
 	{
-	if (s == NULL || s->session == NULL)
+	if (s == NULL)
 		return NULL;
-	return(s->session->psk_identity_hint);
+	/* The hint is stored in SSL and SSL_SESSION with the one in SSL_SESSION
+	 * taking precedence. */
+	if (s->session != NULL)
+		return(s->session->psk_identity_hint);
+	return(s->psk_identity_hint);
 	}
 
 const char *SSL_get_psk_identity(const SSL *s)
diff --git a/main/openssl/ssl/ssl_sess.c b/main/openssl/ssl/ssl_sess.c
index 920b763e..ec088404 100644
--- a/main/openssl/ssl/ssl_sess.c
+++ b/main/openssl/ssl/ssl_sess.c
@@ -426,6 +426,18 @@ int ssl_get_new_session(SSL *s, int session)
 			memcpy(ss->tlsext_ellipticcurvelist, s->tlsext_ellipticcurvelist, s->tlsext_ellipticcurvelist_length);
 			}
 #endif
+#endif
+#ifndef OPENSSL_NO_PSK
+		if (s->psk_identity_hint)
+			{
+			ss->psk_identity_hint = BUF_strdup(s->psk_identity_hint);
+			if (ss->psk_identity_hint == NULL)
+				{
+				SSLerr(SSL_F_SSL_GET_NEW_SESSION, ERR_R_MALLOC_FAILURE);
+				SSL_SESSION_free(ss);
+				return 0;
+				}
+			}
 #endif
 		}
 	else
diff --git a/main/openssl/ssl/tls1.h b/main/openssl/ssl/tls1.h
index 6283c6a7..ec8948d5 100644
--- a/main/openssl/ssl/tls1.h
+++ b/main/openssl/ssl/tls1.h
@@ -531,6 +531,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031
 #define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032
 
+/* ECDHE PSK ciphersuites from RFC 5489 */
+#define TLS1_CK_ECDHE_PSK_WITH_AES_128_CBC_SHA256    0x0300C037
+#define TLS1_CK_ECDHE_PSK_WITH_AES_256_CBC_SHA384    0x0300C038
+
 /* XXX
  * Inconsistency alert:
  * The OpenSSL names of ciphers with ephemeral DH here include the string
@@ -682,6 +686,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"
 #define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384"
 
+/* ECDHE PSK ciphersuites from RFC 5489 */
+#define TLS1_TXT_ECDHE_PSK_WITH_AES_128_CBC_SHA256  "ECDHE-PSK-WITH-AES-128-CBC-SHA256"
+#define TLS1_TXT_ECDHE_PSK_WITH_AES_256_CBC_SHA384  "ECDHE-PSK-WITH-AES-256-CBC-SHA384"
+
 #define TLS_CT_RSA_SIGN			1
 #define TLS_CT_DSS_SIGN			2
 #define TLS_CT_RSA_FIXED_DH		3
-- 
cgit v1.2.3