author    Arne Schwabe <arne@rfc2549.org>  2014-04-23 09:56:37 +0200
committer Arne Schwabe <arne@rfc2549.org>  2014-04-23 09:56:37 +0200
commit    e436c963f0976b885a7db04681344779e26dd3b5
tree      240663106f32e02e1c34080656f4ef21a2e1776e /main/openssl/crypto
parent    6a99715a9b072fa249e79c98cd9f03991f0f1219
Update OpenSSL to 1.0.1g and statically link OpenVPN with it
Diffstat (limited to 'main/openssl/crypto')
-rw-r--r--main/openssl/crypto/Android.mk559
-rw-r--r--main/openssl/crypto/LPdir_win32.c30
-rw-r--r--main/openssl/crypto/aes/aes.h5
-rw-r--r--main/openssl/crypto/aes/aes_core.c12
-rw-r--r--main/openssl/crypto/aes/aes_misc.c21
-rw-r--r--main/openssl/crypto/aes/asm/aes-586.S3239
-rw-r--r--[-rwxr-xr-x]main/openssl/crypto/aes/asm/aes-586.pl20
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.pl182
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.s177
-rw-r--r--main/openssl/crypto/aes/asm/aes-mips.S1337
-rw-r--r--main/openssl/crypto/aes/asm/aes-mips.pl1611
-rw-r--r--main/openssl/crypto/aes/asm/aes-parisc.pl1022
-rw-r--r--main/openssl/crypto/aes/asm/aes-ppc.pl444
-rw-r--r--main/openssl/crypto/aes/asm/aes-s390x.pl1054
-rwxr-xr-xmain/openssl/crypto/aes/asm/aes-sparcv9.pl3
-rw-r--r--main/openssl/crypto/aes/asm/aes-x86_64.S2541
-rwxr-xr-xmain/openssl/crypto/aes/asm/aes-x86_64.pl48
-rw-r--r--main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S1396
-rw-r--r--main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl1250
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86.S2143
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86.pl2189
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86_64.S2535
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86_64.pl3069
-rw-r--r--main/openssl/crypto/aes/asm/bsaes-x86_64.S2498
-rw-r--r--main/openssl/crypto/aes/asm/bsaes-x86_64.pl3108
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86.S661
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86.pl903
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86_64.S828
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86_64.pl1207
-rw-r--r--main/openssl/crypto/arm_arch.h51
-rw-r--r--main/openssl/crypto/armv4cpuid.S154
-rw-r--r--main/openssl/crypto/asn1/a_d2i_fp.c54
-rw-r--r--main/openssl/crypto/asn1/a_digest.c6
-rw-r--r--main/openssl/crypto/asn1/a_int.c6
-rw-r--r--main/openssl/crypto/asn1/a_sign.c111
-rw-r--r--main/openssl/crypto/asn1/a_strex.c1
-rw-r--r--main/openssl/crypto/asn1/a_verify.c83
-rw-r--r--main/openssl/crypto/asn1/ameth_lib.c12
-rw-r--r--main/openssl/crypto/asn1/asn1.h8
-rw-r--r--main/openssl/crypto/asn1/asn1_err.c5
-rw-r--r--main/openssl/crypto/asn1/asn1_locl.h11
-rw-r--r--main/openssl/crypto/asn1/asn_mime.c23
-rw-r--r--main/openssl/crypto/asn1/n_pkey.c38
-rw-r--r--main/openssl/crypto/asn1/p5_pbev2.c143
-rw-r--r--main/openssl/crypto/asn1/t_crl.c3
-rw-r--r--main/openssl/crypto/asn1/t_x509.c55
-rw-r--r--main/openssl/crypto/asn1/tasn_prn.c12
-rw-r--r--main/openssl/crypto/asn1/x_algor.c14
-rw-r--r--main/openssl/crypto/asn1/x_name.c3
-rw-r--r--main/openssl/crypto/asn1/x_pubkey.c14
-rw-r--r--main/openssl/crypto/bf/asm/bf-586.S896
-rw-r--r--main/openssl/crypto/bf/bf_skey.c8
-rw-r--r--main/openssl/crypto/bf/blowfish.h4
-rw-r--r--main/openssl/crypto/bio/b_sock.c13
-rw-r--r--main/openssl/crypto/bio/bf_buff.c15
-rw-r--r--main/openssl/crypto/bio/bio.h79
-rw-r--r--main/openssl/crypto/bio/bio_err.c3
-rw-r--r--main/openssl/crypto/bio/bio_lib.c28
-rw-r--r--main/openssl/crypto/bio/bss_bio.c18
-rw-r--r--main/openssl/crypto/bio/bss_dgram.c1073
-rw-r--r--main/openssl/crypto/bn/asm/armv4-gf2m.S213
-rw-r--r--main/openssl/crypto/bn/asm/armv4-gf2m.pl278
-rw-r--r--main/openssl/crypto/bn/asm/armv4-mont.pl23
-rw-r--r--main/openssl/crypto/bn/asm/armv4-mont.s20
-rw-r--r--main/openssl/crypto/bn/asm/bn-586.S1384
-rw-r--r--main/openssl/crypto/bn/asm/bn-mips.S2175
-rw-r--r--main/openssl/crypto/bn/asm/co-586.S1254
-rw-r--r--main/openssl/crypto/bn/asm/ia64-mont.pl851
-rw-r--r--main/openssl/crypto/bn/asm/ia64.S2
-rw-r--r--main/openssl/crypto/bn/asm/mips-mont.S284
-rw-r--r--main/openssl/crypto/bn/asm/mips-mont.pl426
-rw-r--r--main/openssl/crypto/bn/asm/mips.pl2583
-rw-r--r--main/openssl/crypto/bn/asm/modexp512-x86_64.S1773
-rw-r--r--main/openssl/crypto/bn/asm/modexp512-x86_64.pl1497
-rw-r--r--main/openssl/crypto/bn/asm/parisc-mont.pl995
-rw-r--r--main/openssl/crypto/bn/asm/ppc-mont.pl107
-rw-r--r--main/openssl/crypto/bn/asm/ppc.pl45
-rw-r--r--main/openssl/crypto/bn/asm/ppc64-mont.pl338
-rw-r--r--main/openssl/crypto/bn/asm/s390x-gf2m.pl221
-rw-r--r--main/openssl/crypto/bn/asm/s390x-mont.pl102
-rw-r--r--main/openssl/crypto/bn/asm/x86-gf2m.S335
-rw-r--r--main/openssl/crypto/bn/asm/x86-gf2m.pl313
-rw-r--r--main/openssl/crypto/bn/asm/x86-mont.S338
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86-mont.pl4
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gcc.c2
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gf2m.S291
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gf2m.pl390
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-mont.S1374
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86_64-mont.pl1489
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-mont5.S784
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86_64-mont5.pl1071
-rw-r--r--main/openssl/crypto/bn/bn.h21
-rw-r--r--main/openssl/crypto/bn/bn_blind.c37
-rw-r--r--main/openssl/crypto/bn/bn_div.c274
-rw-r--r--main/openssl/crypto/bn/bn_err.c2
-rw-r--r--main/openssl/crypto/bn/bn_exp.c240
-rw-r--r--main/openssl/crypto/bn/bn_gcd.c1
-rw-r--r--main/openssl/crypto/bn/bn_gf2m.c114
-rw-r--r--main/openssl/crypto/bn/bn_lcl.h30
-rw-r--r--main/openssl/crypto/bn/bn_lib.c19
-rw-r--r--main/openssl/crypto/bn/bn_mont.c116
-rw-r--r--main/openssl/crypto/bn/bn_nist.c387
-rw-r--r--main/openssl/crypto/bn/bn_print.c19
-rw-r--r--main/openssl/crypto/bn/bn_rand.c70
-rw-r--r--main/openssl/crypto/bn/bn_shift.c27
-rw-r--r--main/openssl/crypto/bn/bn_word.c25
-rw-r--r--main/openssl/crypto/bn/bntest.c8
-rw-r--r--main/openssl/crypto/buffer/buf_str.c119
-rw-r--r--main/openssl/crypto/buffer/buffer.c79
-rw-r--r--main/openssl/crypto/buffer/buffer.h2
-rw-r--r--main/openssl/crypto/cmac/cm_ameth.c97
-rw-r--r--main/openssl/crypto/cmac/cm_pmeth.c224
-rw-r--r--main/openssl/crypto/cmac/cmac.c308
-rw-r--r--main/openssl/crypto/cmac/cmac.h82
-rw-r--r--main/openssl/crypto/cms/cms.h501
-rw-r--r--main/openssl/crypto/cms/cms_asn1.c389
-rw-r--r--main/openssl/crypto/cms/cms_att.c195
-rw-r--r--main/openssl/crypto/cms/cms_cd.c136
-rw-r--r--main/openssl/crypto/cms/cms_dd.c148
-rw-r--r--main/openssl/crypto/cms/cms_enc.c294
-rw-r--r--main/openssl/crypto/cms/cms_env.c876
-rw-r--r--main/openssl/crypto/cms/cms_err.c245
-rw-r--r--main/openssl/crypto/cms/cms_ess.c420
-rw-r--r--main/openssl/crypto/cms/cms_io.c133
-rw-r--r--main/openssl/crypto/cms/cms_lcl.h473
-rw-r--r--main/openssl/crypto/cms/cms_lib.c624
-rw-r--r--main/openssl/crypto/cms/cms_pwri.c454
-rw-r--r--main/openssl/crypto/cms/cms_sd.c985
-rw-r--r--main/openssl/crypto/cms/cms_smime.c850
-rw-r--r--main/openssl/crypto/comp/c_rle.c4
-rw-r--r--main/openssl/crypto/conf/conf_mall.c1
-rw-r--r--main/openssl/crypto/cpt_err.c4
-rw-r--r--main/openssl/crypto/cryptlib.c56
-rw-r--r--main/openssl/crypto/cryptlib.h4
-rw-r--r--main/openssl/crypto/crypto.h40
-rw-r--r--main/openssl/crypto/des/asm/crypt586.S879
-rw-r--r--main/openssl/crypto/des/asm/des-586.S1837
-rw-r--r--main/openssl/crypto/des/des.h3
-rw-r--r--main/openssl/crypto/des/set_key.c8
-rw-r--r--main/openssl/crypto/des/str2key.c2
-rw-r--r--main/openssl/crypto/dh/dh.h20
-rw-r--r--main/openssl/crypto/dh/dh_ameth.c1
-rw-r--r--main/openssl/crypto/dh/dh_err.c7
-rw-r--r--main/openssl/crypto/dh/dh_gen.c17
-rw-r--r--main/openssl/crypto/dh/dh_key.c33
-rw-r--r--main/openssl/crypto/dh/dh_lib.c15
-rw-r--r--main/openssl/crypto/dsa/dsa.h30
-rw-r--r--main/openssl/crypto/dsa/dsa_ameth.c47
-rw-r--r--main/openssl/crypto/dsa/dsa_asn1.c40
-rw-r--r--main/openssl/crypto/dsa/dsa_err.c8
-rw-r--r--main/openssl/crypto/dsa/dsa_gen.c35
-rw-r--r--main/openssl/crypto/dsa/dsa_key.c16
-rw-r--r--main/openssl/crypto/dsa/dsa_lib.c22
-rw-r--r--main/openssl/crypto/dsa/dsa_locl.h1
-rw-r--r--main/openssl/crypto/dsa/dsa_ossl.c44
-rw-r--r--main/openssl/crypto/dsa/dsa_pmeth.c6
-rw-r--r--main/openssl/crypto/dsa/dsa_sign.c57
-rw-r--r--main/openssl/crypto/dsa/dsa_vrf.c29
-rw-r--r--main/openssl/crypto/dso/dso_dlfcn.c3
-rw-r--r--main/openssl/crypto/dso/dso_lib.c8
-rw-r--r--main/openssl/crypto/ec/ec.h108
-rw-r--r--main/openssl/crypto/ec/ec2_mult.c4
-rw-r--r--main/openssl/crypto/ec/ec2_oct.c407
-rw-r--r--main/openssl/crypto/ec/ec2_smpl.c353
-rw-r--r--main/openssl/crypto/ec/ec_ameth.c3
-rw-r--r--main/openssl/crypto/ec/ec_asn1.c30
-rw-r--r--main/openssl/crypto/ec/ec_curve.c197
-rw-r--r--main/openssl/crypto/ec/ec_cvt.c28
-rw-r--r--main/openssl/crypto/ec/ec_err.c20
-rw-r--r--main/openssl/crypto/ec/ec_key.c127
-rw-r--r--main/openssl/crypto/ec/ec_lcl.h56
-rw-r--r--main/openssl/crypto/ec/ec_lib.c90
-rw-r--r--main/openssl/crypto/ec/ec_oct.c199
-rw-r--r--main/openssl/crypto/ec/ec_pmeth.c3
-rw-r--r--main/openssl/crypto/ec/eck_prn.c3
-rw-r--r--main/openssl/crypto/ec/ecp_mont.c13
-rw-r--r--main/openssl/crypto/ec/ecp_nist.c13
-rw-r--r--main/openssl/crypto/ec/ecp_oct.c433
-rw-r--r--main/openssl/crypto/ec/ecp_smpl.c379
-rw-r--r--main/openssl/crypto/ec/ectest.c343
-rw-r--r--main/openssl/crypto/ecdh/ecdh.h2
-rw-r--r--main/openssl/crypto/ecdh/ecdhtest.c6
-rw-r--r--main/openssl/crypto/ecdh/ech_err.c4
-rw-r--r--main/openssl/crypto/ecdh/ech_key.c3
-rw-r--r--main/openssl/crypto/ecdh/ech_lib.c31
-rw-r--r--main/openssl/crypto/ecdh/ech_locl.h8
-rw-r--r--main/openssl/crypto/ecdh/ech_ossl.c2
-rw-r--r--main/openssl/crypto/ecdsa/ecdsa.h3
-rw-r--r--main/openssl/crypto/ecdsa/ecdsatest.c89
-rw-r--r--main/openssl/crypto/ecdsa/ecs_err.c5
-rw-r--r--main/openssl/crypto/ecdsa/ecs_lib.c32
-rw-r--r--main/openssl/crypto/ecdsa/ecs_locl.h13
-rw-r--r--main/openssl/crypto/ecdsa/ecs_ossl.c43
-rw-r--r--main/openssl/crypto/ecdsa/ecs_sign.c10
-rw-r--r--main/openssl/crypto/engine/eng_all.c9
-rw-r--r--main/openssl/crypto/engine/eng_cryptodev.c71
-rw-r--r--main/openssl/crypto/engine/eng_dyn.c5
-rw-r--r--main/openssl/crypto/engine/eng_fat.c3
-rw-r--r--main/openssl/crypto/engine/engine.h9
-rw-r--r--main/openssl/crypto/engine/tb_asnmth.c246
-rw-r--r--main/openssl/crypto/engine/tb_pkmeth.c167
-rw-r--r--main/openssl/crypto/err/err.c13
-rw-r--r--main/openssl/crypto/err/err.h3
-rw-r--r--main/openssl/crypto/err/err_all.c12
-rw-r--r--main/openssl/crypto/evp/bio_md.c11
-rw-r--r--main/openssl/crypto/evp/bio_ok.c103
-rw-r--r--main/openssl/crypto/evp/c_allc.c18
-rw-r--r--main/openssl/crypto/evp/digest.c34
-rw-r--r--main/openssl/crypto/evp/e_aes.c1280
-rw-r--r--main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c581
-rw-r--r--main/openssl/crypto/evp/e_des3.c9
-rw-r--r--main/openssl/crypto/evp/e_null.c4
-rw-r--r--main/openssl/crypto/evp/e_rc2.c3
-rw-r--r--main/openssl/crypto/evp/e_rc4.c1
-rw-r--r--main/openssl/crypto/evp/e_rc4_hmac_md5.c298
-rw-r--r--main/openssl/crypto/evp/evp.h102
-rw-r--r--main/openssl/crypto/evp/evp_cnf.c125
-rw-r--r--main/openssl/crypto/evp/evp_enc.c95
-rw-r--r--main/openssl/crypto/evp/evp_err.c26
-rw-r--r--main/openssl/crypto/evp/evp_key.c27
-rw-r--r--main/openssl/crypto/evp/evp_lib.c4
-rw-r--r--main/openssl/crypto/evp/evp_locl.h40
-rw-r--r--main/openssl/crypto/evp/evp_pbe.c5
-rw-r--r--main/openssl/crypto/evp/evptests.txt13
-rw-r--r--main/openssl/crypto/evp/m_dss.c4
-rw-r--r--main/openssl/crypto/evp/m_dss1.c5
-rw-r--r--main/openssl/crypto/evp/m_ecdsa.c3
-rw-r--r--main/openssl/crypto/evp/m_md4.c2
-rw-r--r--main/openssl/crypto/evp/m_md5.c1
-rw-r--r--main/openssl/crypto/evp/m_mdc2.c2
-rw-r--r--main/openssl/crypto/evp/m_ripemd.c1
-rw-r--r--main/openssl/crypto/evp/m_sha1.c7
-rw-r--r--main/openssl/crypto/evp/m_wp.c1
-rw-r--r--main/openssl/crypto/evp/names.c5
-rw-r--r--main/openssl/crypto/evp/p5_crpt.c33
-rw-r--r--main/openssl/crypto/evp/p5_crpt2.c121
-rw-r--r--main/openssl/crypto/evp/p_lib.c6
-rw-r--r--main/openssl/crypto/evp/p_open.c3
-rw-r--r--main/openssl/crypto/evp/p_seal.c3
-rw-r--r--main/openssl/crypto/evp/p_sign.c10
-rw-r--r--main/openssl/crypto/evp/p_verify.c10
-rw-r--r--main/openssl/crypto/evp/pmeth_gn.c5
-rw-r--r--main/openssl/crypto/evp/pmeth_lib.c55
-rw-r--r--main/openssl/crypto/hmac/hm_ameth.c2
-rw-r--r--main/openssl/crypto/hmac/hm_pmeth.c14
-rw-r--r--main/openssl/crypto/hmac/hmac.c37
-rw-r--r--main/openssl/crypto/ia64cpuid.S2
-rw-r--r--main/openssl/crypto/md4/md4.h3
-rw-r--r--main/openssl/crypto/md4/md4_dgst.c36
-rw-r--r--main/openssl/crypto/md4/md4_locl.h8
-rw-r--r--main/openssl/crypto/md5/asm/md5-586.S679
-rw-r--r--main/openssl/crypto/md5/asm/md5-x86_64.S668
-rwxr-xr-xmain/openssl/crypto/md5/asm/md5-x86_64.pl3
-rw-r--r--main/openssl/crypto/md5/md5.h3
-rw-r--r--main/openssl/crypto/md5/md5_dgst.c3
-rw-r--r--main/openssl/crypto/md5/md5_locl.h8
-rw-r--r--main/openssl/crypto/mdc2/mdc2.h3
-rw-r--r--main/openssl/crypto/mdc2/mdc2dgst.c3
-rw-r--r--main/openssl/crypto/mem.c8
-rw-r--r--main/openssl/crypto/modes/asm/ghash-alpha.pl460
-rw-r--r--main/openssl/crypto/modes/asm/ghash-armv4.S408
-rw-r--r--main/openssl/crypto/modes/asm/ghash-armv4.pl429
-rwxr-xr-xmain/openssl/crypto/modes/asm/ghash-ia64.pl463
-rw-r--r--main/openssl/crypto/modes/asm/ghash-parisc.pl731
-rw-r--r--main/openssl/crypto/modes/asm/ghash-s390x.pl262
-rw-r--r--main/openssl/crypto/modes/asm/ghash-sparcv9.pl330
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86.S728
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86.pl1342
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86_64.S1026
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86_64.pl806
-rw-r--r--main/openssl/crypto/modes/cbc128.c35
-rw-r--r--main/openssl/crypto/modes/ccm128.c441
-rw-r--r--main/openssl/crypto/modes/cfb128.c11
-rw-r--r--main/openssl/crypto/modes/ctr128.c92
-rw-r--r--main/openssl/crypto/modes/gcm128.c1817
-rw-r--r--main/openssl/crypto/modes/modes_lcl.h128
-rw-r--r--main/openssl/crypto/modes/ofb128.c11
-rw-r--r--main/openssl/crypto/modes/xts128.c187
-rw-r--r--main/openssl/crypto/o_init.c82
-rw-r--r--main/openssl/crypto/objects/o_names.c2
-rw-r--r--main/openssl/crypto/objects/obj_dat.h136
-rw-r--r--main/openssl/crypto/objects/obj_mac.h142
-rw-r--r--main/openssl/crypto/objects/obj_mac.num27
-rw-r--r--main/openssl/crypto/objects/obj_xref.c9
-rw-r--r--main/openssl/crypto/objects/obj_xref.h2
-rw-r--r--main/openssl/crypto/objects/obj_xref.txt4
-rw-r--r--main/openssl/crypto/objects/objects.txt41
-rw-r--r--main/openssl/crypto/ocsp/ocsp_lib.c3
-rw-r--r--main/openssl/crypto/ocsp/ocsp_vfy.c10
-rw-r--r--main/openssl/crypto/opensslconf-32.h316
-rw-r--r--main/openssl/crypto/opensslconf-64.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-32.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-64.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-trusty.h448
-rw-r--r--main/openssl/crypto/opensslconf-static.h6
-rw-r--r--main/openssl/crypto/opensslconf-trusty.h448
-rw-r--r--main/openssl/crypto/opensslconf.h256
-rw-r--r--main/openssl/crypto/opensslv.h6
-rw-r--r--main/openssl/crypto/ossl_typ.h2
-rw-r--r--main/openssl/crypto/pariscid.pl225
-rw-r--r--main/openssl/crypto/pem/pem_all.c161
-rw-r--r--main/openssl/crypto/pem/pem_info.c1
-rw-r--r--main/openssl/crypto/pem/pem_lib.c27
-rw-r--r--main/openssl/crypto/pem/pem_seal.c6
-rw-r--r--main/openssl/crypto/pem/pvkfmt.c58
-rw-r--r--main/openssl/crypto/perlasm/cbc.pl2
-rwxr-xr-xmain/openssl/crypto/perlasm/ppc-xlate.pl13
-rwxr-xr-xmain/openssl/crypto/perlasm/x86_64-xlate.pl221
-rw-r--r--main/openssl/crypto/perlasm/x86asm.pl55
-rw-r--r--main/openssl/crypto/perlasm/x86gas.pl34
-rw-r--r--main/openssl/crypto/perlasm/x86masm.pl20
-rw-r--r--main/openssl/crypto/perlasm/x86nasm.pl15
-rw-r--r--main/openssl/crypto/pkcs12/p12_crt.c7
-rw-r--r--main/openssl/crypto/pkcs12/p12_decr.c9
-rw-r--r--main/openssl/crypto/pkcs12/p12_key.c40
-rw-r--r--main/openssl/crypto/pkcs12/p12_kiss.c2
-rw-r--r--main/openssl/crypto/pkcs12/p12_mutl.c12
-rw-r--r--main/openssl/crypto/pkcs7/pk7_doit.c101
-rw-r--r--main/openssl/crypto/pkcs7/pk7_smime.c25
-rwxr-xr-xmain/openssl/crypto/ppccpuid.pl48
-rw-r--r--main/openssl/crypto/rand/md_rand.c55
-rw-r--r--main/openssl/crypto/rand/rand.h10
-rw-r--r--main/openssl/crypto/rand/rand_err.c7
-rw-r--r--main/openssl/crypto/rand/rand_lib.c130
-rw-r--r--main/openssl/crypto/rand/rand_unix.c108
-rw-r--r--main/openssl/crypto/rand/randfile.c4
-rw-r--r--main/openssl/crypto/rc2/rc2.h4
-rw-r--r--main/openssl/crypto/rc2/rc2_skey.c8
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-586.pl162
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S1259
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl632
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-parisc.pl314
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-s390x.pl47
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-x86_64.S615
-rw-r--r--[-rwxr-xr-x]main/openssl/crypto/rc4/asm/rc4-x86_64.pl295
-rw-r--r--main/openssl/crypto/rc4/rc4.h1
-rw-r--r--main/openssl/crypto/rc4/rc4_skey.c36
-rw-r--r--main/openssl/crypto/rc4/rc4_utl.c62
-rw-r--r--main/openssl/crypto/rc4/rc4test.c6
-rw-r--r--main/openssl/crypto/ripemd/ripemd.h3
-rw-r--r--main/openssl/crypto/ripemd/rmd_dgst.c33
-rw-r--r--main/openssl/crypto/ripemd/rmd_locl.h10
-rw-r--r--main/openssl/crypto/rsa/rsa.h81
-rw-r--r--main/openssl/crypto/rsa/rsa_ameth.c351
-rw-r--r--main/openssl/crypto/rsa/rsa_asn1.c10
-rw-r--r--main/openssl/crypto/rsa/rsa_chk.c6
-rw-r--r--main/openssl/crypto/rsa/rsa_crpt.c257
-rw-r--r--main/openssl/crypto/rsa/rsa_eay.c86
-rw-r--r--main/openssl/crypto/rsa/rsa_err.c21
-rw-r--r--main/openssl/crypto/rsa/rsa_gen.c15
-rw-r--r--main/openssl/crypto/rsa/rsa_lib.c172
-rw-r--r--main/openssl/crypto/rsa/rsa_oaep.c8
-rw-r--r--main/openssl/crypto/rsa/rsa_pmeth.c156
-rw-r--r--main/openssl/crypto/rsa/rsa_pss.c81
-rw-r--r--main/openssl/crypto/rsa/rsa_sign.c33
-rw-r--r--main/openssl/crypto/s390xcap.c12
-rw-r--r--main/openssl/crypto/s390xcpuid.S17
-rw-r--r--main/openssl/crypto/sha/asm/sha1-586.S1380
-rw-r--r--main/openssl/crypto/sha/asm/sha1-586.pl1107
-rw-r--r--main/openssl/crypto/sha/asm/sha1-alpha.pl322
-rw-r--r--main/openssl/crypto/sha/asm/sha1-armv4-large.pl41
-rw-r--r--main/openssl/crypto/sha/asm/sha1-armv4-large.s209
-rw-r--r--main/openssl/crypto/sha/asm/sha1-ia64.pl193
-rw-r--r--main/openssl/crypto/sha/asm/sha1-mips.S1664
-rw-r--r--main/openssl/crypto/sha/asm/sha1-mips.pl354
-rw-r--r--main/openssl/crypto/sha/asm/sha1-parisc.pl260
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha1-ppc.pl83
-rw-r--r--main/openssl/crypto/sha/asm/sha1-s390x.pl50
-rw-r--r--main/openssl/crypto/sha/asm/sha1-sparcv9a.pl2
-rw-r--r--main/openssl/crypto/sha/asm/sha1-x86_64.S2486
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha1-x86_64.pl1188
-rw-r--r--main/openssl/crypto/sha/asm/sha256-586.S258
-rw-r--r--main/openssl/crypto/sha/asm/sha256-586.pl54
-rw-r--r--main/openssl/crypto/sha/asm/sha256-armv4.pl55
-rw-r--r--main/openssl/crypto/sha/asm/sha256-armv4.s858
-rw-r--r--main/openssl/crypto/sha/asm/sha256-mips.S1998
-rw-r--r--main/openssl/crypto/sha/asm/sha256-x86_64.S1778
-rw-r--r--main/openssl/crypto/sha/asm/sha512-586.S563
-rw-r--r--main/openssl/crypto/sha/asm/sha512-586.pl18
-rw-r--r--main/openssl/crypto/sha/asm/sha512-armv4.pl357
-rw-r--r--main/openssl/crypto/sha/asm/sha512-armv4.s1635
-rw-r--r--main/openssl/crypto/sha/asm/sha512-mips.pl455
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-parisc.pl793
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-ppc.pl114
-rw-r--r--main/openssl/crypto/sha/asm/sha512-s390x.pl63
-rw-r--r--main/openssl/crypto/sha/asm/sha512-sparcv9.pl6
-rw-r--r--main/openssl/crypto/sha/asm/sha512-x86_64.S1802
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-x86_64.pl89
-rw-r--r--main/openssl/crypto/sha/sha.h14
-rw-r--r--main/openssl/crypto/sha/sha1_one.c2
-rw-r--r--main/openssl/crypto/sha/sha1dgst.c1
-rw-r--r--main/openssl/crypto/sha/sha256.c10
-rw-r--r--main/openssl/crypto/sha/sha512.c63
-rw-r--r--main/openssl/crypto/sha/sha_dgst.c1
-rw-r--r--main/openssl/crypto/sha/sha_locl.h46
-rw-r--r--main/openssl/crypto/sparccpuid.S6
-rw-r--r--main/openssl/crypto/sparcv9cap.c4
-rw-r--r--main/openssl/crypto/srp/srp.h172
-rw-r--r--main/openssl/crypto/srp/srp_grps.h517
-rw-r--r--main/openssl/crypto/srp/srp_lcl.h83
-rw-r--r--main/openssl/crypto/srp/srp_lib.c361
-rw-r--r--main/openssl/crypto/srp/srp_vfy.c658
-rw-r--r--main/openssl/crypto/stack/safestack.h138
-rw-r--r--main/openssl/crypto/symhacks.h32
-rw-r--r--main/openssl/crypto/ui/ui.h2
-rw-r--r--main/openssl/crypto/ui/ui_openssl.c6
-rw-r--r--main/openssl/crypto/x509/x509.h11
-rw-r--r--main/openssl/crypto/x509/x509_cmp.c34
-rw-r--r--main/openssl/crypto/x509/x509_lu.c2
-rw-r--r--main/openssl/crypto/x509/x509_vfy.c22
-rw-r--r--main/openssl/crypto/x509/x509type.c32
-rw-r--r--main/openssl/crypto/x509/x_all.c21
-rw-r--r--main/openssl/crypto/x509v3/v3_addr.c125
-rw-r--r--main/openssl/crypto/x509v3/v3_asid.c63
-rw-r--r--main/openssl/crypto/x509v3/v3_pci.c2
-rw-r--r--main/openssl/crypto/x509v3/v3_pcia.c2
-rw-r--r--main/openssl/crypto/x509v3/v3_purp.c4
-rw-r--r--main/openssl/crypto/x509v3/v3_skey.c3
-rw-r--r--main/openssl/crypto/x509v3/v3_utl.c2
-rw-r--r--main/openssl/crypto/x86_64cpuid.S234
-rw-r--r--main/openssl/crypto/x86_64cpuid.pl88
-rw-r--r--main/openssl/crypto/x86cpuid.S334
-rw-r--r--main/openssl/crypto/x86cpuid.pl84
423 files changed, 115106 insertions, 5525 deletions
diff --git a/main/openssl/crypto/Android.mk b/main/openssl/crypto/Android.mk
deleted file mode 100644
index 70aef35d..00000000
--- a/main/openssl/crypto/Android.mk
+++ /dev/null
@@ -1,559 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
-arm_src_files := \
- aes/asm/aes-armv4.s \
- bn/asm/armv4-mont.s \
- sha/asm/sha1-armv4-large.s \
- sha/asm/sha256-armv4.s \
- sha/asm/sha512-armv4.s
-non_arm_src_files := aes/aes_core.c
-
-local_src_files := \
- cryptlib.c \
- mem.c \
- mem_clr.c \
- mem_dbg.c \
- cversion.c \
- ex_data.c \
- cpt_err.c \
- ebcdic.c \
- uid.c \
- o_time.c \
- o_str.c \
- o_dir.c \
- aes/aes_cbc.c \
- aes/aes_cfb.c \
- aes/aes_ctr.c \
- aes/aes_ecb.c \
- aes/aes_misc.c \
- aes/aes_ofb.c \
- aes/aes_wrap.c \
- asn1/a_bitstr.c \
- asn1/a_bool.c \
- asn1/a_bytes.c \
- asn1/a_d2i_fp.c \
- asn1/a_digest.c \
- asn1/a_dup.c \
- asn1/a_enum.c \
- asn1/a_gentm.c \
- asn1/a_i2d_fp.c \
- asn1/a_int.c \
- asn1/a_mbstr.c \
- asn1/a_object.c \
- asn1/a_octet.c \
- asn1/a_print.c \
- asn1/a_set.c \
- asn1/a_sign.c \
- asn1/a_strex.c \
- asn1/a_strnid.c \
- asn1/a_time.c \
- asn1/a_type.c \
- asn1/a_utctm.c \
- asn1/a_utf8.c \
- asn1/a_verify.c \
- asn1/ameth_lib.c \
- asn1/asn1_err.c \
- asn1/asn1_gen.c \
- asn1/asn1_lib.c \
- asn1/asn1_par.c \
- asn1/asn_mime.c \
- asn1/asn_moid.c \
- asn1/asn_pack.c \
- asn1/bio_asn1.c \
- asn1/bio_ndef.c \
- asn1/d2i_pr.c \
- asn1/d2i_pu.c \
- asn1/evp_asn1.c \
- asn1/f_enum.c \
- asn1/f_int.c \
- asn1/f_string.c \
- asn1/i2d_pr.c \
- asn1/i2d_pu.c \
- asn1/n_pkey.c \
- asn1/nsseq.c \
- asn1/p5_pbe.c \
- asn1/p5_pbev2.c \
- asn1/p8_pkey.c \
- asn1/t_bitst.c \
- asn1/t_crl.c \
- asn1/t_pkey.c \
- asn1/t_req.c \
- asn1/t_spki.c \
- asn1/t_x509.c \
- asn1/t_x509a.c \
- asn1/tasn_dec.c \
- asn1/tasn_enc.c \
- asn1/tasn_fre.c \
- asn1/tasn_new.c \
- asn1/tasn_prn.c \
- asn1/tasn_typ.c \
- asn1/tasn_utl.c \
- asn1/x_algor.c \
- asn1/x_attrib.c \
- asn1/x_bignum.c \
- asn1/x_crl.c \
- asn1/x_exten.c \
- asn1/x_info.c \
- asn1/x_long.c \
- asn1/x_name.c \
- asn1/x_nx509.c \
- asn1/x_pkey.c \
- asn1/x_pubkey.c \
- asn1/x_req.c \
- asn1/x_sig.c \
- asn1/x_spki.c \
- asn1/x_val.c \
- asn1/x_x509.c \
- asn1/x_x509a.c \
- bf/bf_cfb64.c \
- bf/bf_ecb.c \
- bf/bf_enc.c \
- bf/bf_ofb64.c \
- bf/bf_skey.c \
- bio/b_dump.c \
- bio/b_print.c \
- bio/b_sock.c \
- bio/bf_buff.c \
- bio/bf_nbio.c \
- bio/bf_null.c \
- bio/bio_cb.c \
- bio/bio_err.c \
- bio/bio_lib.c \
- bio/bss_acpt.c \
- bio/bss_bio.c \
- bio/bss_conn.c \
- bio/bss_dgram.c \
- bio/bss_fd.c \
- bio/bss_file.c \
- bio/bss_log.c \
- bio/bss_mem.c \
- bio/bss_null.c \
- bio/bss_sock.c \
- bn/bn_add.c \
- bn/bn_asm.c \
- bn/bn_blind.c \
- bn/bn_const.c \
- bn/bn_ctx.c \
- bn/bn_div.c \
- bn/bn_err.c \
- bn/bn_exp.c \
- bn/bn_exp2.c \
- bn/bn_gcd.c \
- bn/bn_gf2m.c \
- bn/bn_kron.c \
- bn/bn_lib.c \
- bn/bn_mod.c \
- bn/bn_mont.c \
- bn/bn_mpi.c \
- bn/bn_mul.c \
- bn/bn_nist.c \
- bn/bn_prime.c \
- bn/bn_print.c \
- bn/bn_rand.c \
- bn/bn_recp.c \
- bn/bn_shift.c \
- bn/bn_sqr.c \
- bn/bn_sqrt.c \
- bn/bn_word.c \
- buffer/buf_err.c \
- buffer/buffer.c \
- comp/c_rle.c \
- comp/c_zlib.c \
- comp/comp_err.c \
- comp/comp_lib.c \
- conf/conf_api.c \
- conf/conf_def.c \
- conf/conf_err.c \
- conf/conf_lib.c \
- conf/conf_mall.c \
- conf/conf_mod.c \
- conf/conf_sap.c \
- des/cbc_cksm.c \
- des/cbc_enc.c \
- des/cfb64ede.c \
- des/cfb64enc.c \
- des/cfb_enc.c \
- des/des_enc.c \
- des/des_old.c \
- des/des_old2.c \
- des/ecb3_enc.c \
- des/ecb_enc.c \
- des/ede_cbcm_enc.c \
- des/enc_read.c \
- des/enc_writ.c \
- des/fcrypt.c \
- des/fcrypt_b.c \
- des/ofb64ede.c \
- des/ofb64enc.c \
- des/ofb_enc.c \
- des/pcbc_enc.c \
- des/qud_cksm.c \
- des/rand_key.c \
- des/read2pwd.c \
- des/rpc_enc.c \
- des/set_key.c \
- des/str2key.c \
- des/xcbc_enc.c \
- dh/dh_ameth.c \
- dh/dh_asn1.c \
- dh/dh_check.c \
- dh/dh_depr.c \
- dh/dh_err.c \
- dh/dh_gen.c \
- dh/dh_key.c \
- dh/dh_lib.c \
- dh/dh_pmeth.c \
- dsa/dsa_ameth.c \
- dsa/dsa_asn1.c \
- dsa/dsa_depr.c \
- dsa/dsa_err.c \
- dsa/dsa_gen.c \
- dsa/dsa_key.c \
- dsa/dsa_lib.c \
- dsa/dsa_ossl.c \
- dsa/dsa_pmeth.c \
- dsa/dsa_prn.c \
- dsa/dsa_sign.c \
- dsa/dsa_vrf.c \
- dso/dso_dl.c \
- dso/dso_dlfcn.c \
- dso/dso_err.c \
- dso/dso_lib.c \
- dso/dso_null.c \
- dso/dso_openssl.c \
- ec/ec2_mult.c \
- ec/ec2_smpl.c \
- ec/ec_ameth.c \
- ec/ec_asn1.c \
- ec/ec_check.c \
- ec/ec_curve.c \
- ec/ec_cvt.c \
- ec/ec_err.c \
- ec/ec_key.c \
- ec/ec_lib.c \
- ec/ec_mult.c \
- ec/ec_pmeth.c \
- ec/ec_print.c \
- ec/eck_prn.c \
- ec/ecp_mont.c \
- ec/ecp_nist.c \
- ec/ecp_smpl.c \
- ecdh/ech_err.c \
- ecdh/ech_key.c \
- ecdh/ech_lib.c \
- ecdh/ech_ossl.c \
- ecdsa/ecs_asn1.c \
- ecdsa/ecs_err.c \
- ecdsa/ecs_lib.c \
- ecdsa/ecs_ossl.c \
- ecdsa/ecs_sign.c \
- ecdsa/ecs_vrf.c \
- err/err.c \
- err/err_all.c \
- err/err_prn.c \
- evp/bio_b64.c \
- evp/bio_enc.c \
- evp/bio_md.c \
- evp/bio_ok.c \
- evp/c_all.c \
- evp/c_allc.c \
- evp/c_alld.c \
- evp/digest.c \
- evp/e_aes.c \
- evp/e_bf.c \
- evp/e_des.c \
- evp/e_des3.c \
- evp/e_null.c \
- evp/e_old.c \
- evp/e_rc2.c \
- evp/e_rc4.c \
- evp/e_rc5.c \
- evp/e_xcbc_d.c \
- evp/encode.c \
- evp/evp_acnf.c \
- evp/evp_enc.c \
- evp/evp_err.c \
- evp/evp_key.c \
- evp/evp_lib.c \
- evp/evp_pbe.c \
- evp/evp_pkey.c \
- evp/m_dss.c \
- evp/m_dss1.c \
- evp/m_ecdsa.c \
- evp/m_md4.c \
- evp/m_md5.c \
- evp/m_mdc2.c \
- evp/m_null.c \
- evp/m_ripemd.c \
- evp/m_sha1.c \
- evp/m_sigver.c \
- evp/m_wp.c \
- evp/names.c \
- evp/p5_crpt.c \
- evp/p5_crpt2.c \
- evp/p_dec.c \
- evp/p_enc.c \
- evp/p_lib.c \
- evp/p_open.c \
- evp/p_seal.c \
- evp/p_sign.c \
- evp/p_verify.c \
- evp/pmeth_fn.c \
- evp/pmeth_gn.c \
- evp/pmeth_lib.c \
- hmac/hm_ameth.c \
- hmac/hm_pmeth.c \
- hmac/hmac.c \
- krb5/krb5_asn.c \
- lhash/lh_stats.c \
- lhash/lhash.c \
- md4/md4_dgst.c \
- md4/md4_one.c \
- md5/md5_dgst.c \
- md5/md5_one.c \
- modes/cbc128.c \
- modes/cfb128.c \
- modes/ctr128.c \
- modes/ofb128.c \
- objects/o_names.c \
- objects/obj_dat.c \
- objects/obj_err.c \
- objects/obj_lib.c \
- objects/obj_xref.c \
- ocsp/ocsp_asn.c \
- ocsp/ocsp_cl.c \
- ocsp/ocsp_err.c \
- ocsp/ocsp_ext.c \
- ocsp/ocsp_ht.c \
- ocsp/ocsp_lib.c \
- ocsp/ocsp_prn.c \
- ocsp/ocsp_srv.c \
- ocsp/ocsp_vfy.c \
- pem/pem_all.c \
- pem/pem_err.c \
- pem/pem_info.c \
- pem/pem_lib.c \
- pem/pem_oth.c \
- pem/pem_pk8.c \
- pem/pem_pkey.c \
- pem/pem_seal.c \
- pem/pem_sign.c \
- pem/pem_x509.c \
- pem/pem_xaux.c \
- pem/pvkfmt.c \
- pkcs12/p12_add.c \
- pkcs12/p12_asn.c \
- pkcs12/p12_attr.c \
- pkcs12/p12_crpt.c \
- pkcs12/p12_crt.c \
- pkcs12/p12_decr.c \
- pkcs12/p12_init.c \
- pkcs12/p12_key.c \
- pkcs12/p12_kiss.c \
- pkcs12/p12_mutl.c \
- pkcs12/p12_npas.c \
- pkcs12/p12_p8d.c \
- pkcs12/p12_p8e.c \
- pkcs12/p12_utl.c \
- pkcs12/pk12err.c \
- pkcs7/pk7_asn1.c \
- pkcs7/pk7_attr.c \
- pkcs7/pk7_doit.c \
- pkcs7/pk7_lib.c \
- pkcs7/pk7_mime.c \
- pkcs7/pk7_smime.c \
- pkcs7/pkcs7err.c \
- rand/md_rand.c \
- rand/rand_egd.c \
- rand/rand_err.c \
- rand/rand_lib.c \
- rand/rand_unix.c \
- rand/randfile.c \
- rc2/rc2_cbc.c \
- rc2/rc2_ecb.c \
- rc2/rc2_skey.c \
- rc2/rc2cfb64.c \
- rc2/rc2ofb64.c \
- rc4/rc4_enc.c \
- rc4/rc4_skey.c \
- ripemd/rmd_dgst.c \
- ripemd/rmd_one.c \
- rsa/rsa_ameth.c \
- rsa/rsa_asn1.c \
- rsa/rsa_chk.c \
- rsa/rsa_eay.c \
- rsa/rsa_err.c \
- rsa/rsa_gen.c \
- rsa/rsa_lib.c \
- rsa/rsa_none.c \
- rsa/rsa_null.c \
- rsa/rsa_oaep.c \
- rsa/rsa_pk1.c \
- rsa/rsa_pmeth.c \
- rsa/rsa_prn.c \
- rsa/rsa_pss.c \
- rsa/rsa_saos.c \
- rsa/rsa_sign.c \
- rsa/rsa_ssl.c \
- rsa/rsa_x931.c \
- sha/sha1_one.c \
- sha/sha1dgst.c \
- sha/sha256.c \
- sha/sha512.c \
- sha/sha_dgst.c \
- stack/stack.c \
- ts/ts_err.c \
- txt_db/txt_db.c \
- ui/ui_compat.c \
- ui/ui_err.c \
- ui/ui_lib.c \
- ui/ui_openssl.c \
- ui/ui_util.c \
- x509/by_dir.c \
- x509/by_file.c \
- x509/x509_att.c \
- x509/x509_cmp.c \
- x509/x509_d2.c \
- x509/x509_def.c \
- x509/x509_err.c \
- x509/x509_ext.c \
- x509/x509_lu.c \
- x509/x509_obj.c \
- x509/x509_r2x.c \
- x509/x509_req.c \
- x509/x509_set.c \
- x509/x509_trs.c \
- x509/x509_txt.c \
- x509/x509_v3.c \
- x509/x509_vfy.c \
- x509/x509_vpm.c \
- x509/x509cset.c \
- x509/x509name.c \
- x509/x509rset.c \
- x509/x509spki.c \
- x509/x509type.c \
- x509/x_all.c \
- x509v3/pcy_cache.c \
- x509v3/pcy_data.c \
- x509v3/pcy_lib.c \
- x509v3/pcy_map.c \
- x509v3/pcy_node.c \
- x509v3/pcy_tree.c \
- x509v3/v3_akey.c \
- x509v3/v3_akeya.c \
- x509v3/v3_alt.c \
- x509v3/v3_bcons.c \
- x509v3/v3_bitst.c \
- x509v3/v3_conf.c \
- x509v3/v3_cpols.c \
- x509v3/v3_crld.c \
- x509v3/v3_enum.c \
- x509v3/v3_extku.c \
- x509v3/v3_genn.c \
- x509v3/v3_ia5.c \
- x509v3/v3_info.c \
- x509v3/v3_int.c \
- x509v3/v3_lib.c \
- x509v3/v3_ncons.c \
- x509v3/v3_ocsp.c \
- x509v3/v3_pci.c \
- x509v3/v3_pcia.c \
- x509v3/v3_pcons.c \
- x509v3/v3_pku.c \
- x509v3/v3_pmaps.c \
- x509v3/v3_prn.c \
- x509v3/v3_purp.c \
- x509v3/v3_skey.c \
- x509v3/v3_sxnet.c \
- x509v3/v3_utl.c \
- x509v3/v3err.c
-
-local_c_includes := \
- openssl \
- openssl/crypto/asn1 \
- openssl/crypto/evp \
- openssl/include \
- openssl/include/openssl \
- zlib
-
-local_c_flags := -DNO_WINDOWS_BRAINDEATH
-
-#######################################
-# target static library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifneq ($(TARGET_ARCH),x86)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-else
- LOCAL_SRC_FILES += $(non_arm_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_STATIC_LIBRARY)
-
-#######################################
-# target shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-#ifneq ($(TARGET_ARCH),x86)
-#LOCAL_NDK_VERSION := 5
-#LOCAL_SDK_VERSION := 9
-## Use the NDK prebuilt libz and libdl.
-LOCAL_LDFLAGS += -lz -ldl
-#else
-#LOCAL_SHARED_LIBRARIES += libz libdl
-#endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-else
- LOCAL_SRC_FILES += $(non_arm_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_SHARED_LIBRARY)
-
-#######################################
-# Host shared library
-#include $(CLEAR_VARS)
-##include $(LOCAL_PATH)/../android-config.mk
-#LOCAL_SRC_FILES += $(local_src_files)
-#LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-#LOCAL_C_INCLUDES += $(local_c_includes)
-#LOCAL_SRC_FILES += $(non_arm_src_files)
-#LOCAL_STATIC_LIBRARIES += libz
-#LOCAL_LDLIBS += -ldl
-#LOCAL_MODULE_TAGS := optional
-#LOCAL_MODULE:= libcrypto
-#include $(BUILD_HOST_SHARED_LIBRARY)
-
-########################################
-# host static library, which is used by some SDK tools.
-
-#include $(CLEAR_VARS)
-#include $(LOCAL_PATH)/../android-config.mk
-#LOCAL_SRC_FILES += $(local_src_files)
-#LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-#LOCAL_C_INCLUDES += $(local_c_includes)
-#LOCAL_SRC_FILES += $(non_arm_src_files)
-#LOCAL_STATIC_LIBRARIES += libz
-#LOCAL_LDLIBS += -ldl
-#LOCAL_MODULE_TAGS := optional
-#LOCAL_MODULE:= libcrypto_static
-#include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/main/openssl/crypto/LPdir_win32.c b/main/openssl/crypto/LPdir_win32.c
new file mode 100644
index 00000000..e39872da
--- /dev/null
+++ b/main/openssl/crypto/LPdir_win32.c
@@ -0,0 +1,30 @@
+/* $LP: LPlib/source/LPdir_win32.c,v 1.3 2004/08/26 13:36:05 _cvs_levitte Exp $ */
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WIN32
+#define LP_MULTIBYTE_AVAILABLE
+#include "LPdir_win.c"
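The new LPdir_win32.c is a thin platform shim: it sets two feature macros and then textually includes the shared implementation in LPdir_win.c, so one directory-reading source file serves several platforms. Below is a minimal standalone sketch of that configure-by-macro-then-include idiom; all names are hypothetical, and the shared body is inlined as a macro-guarded block so the sketch compiles on its own (the real wrapper #includes LPdir_win.c instead).

#include <stdio.h>

/* The "wrapper" part: pick the platform before pulling in shared code. */
#define LP_SYS_WIN32
#define LP_MULTIBYTE_AVAILABLE

/* Stand-in for the shared LPdir_win.c body: it specializes itself on
 * whatever macros the including wrapper defined above. */
#ifdef LP_SYS_WIN32
static const char *lp_platform(void) { return "win32"; }
#else
static const char *lp_platform(void) { return "generic"; }
#endif

int main(void)
{
    printf("directory code compiled for: %s\n", lp_platform());
#ifdef LP_MULTIBYTE_AVAILABLE
    printf("multibyte filename support: on\n");
#endif
    return 0;
}

Keeping the platform knobs next to the file name means adding a platform is a matter of adding one small wrapper file rather than touching the shared directory-walking code.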
diff --git a/main/openssl/crypto/aes/aes.h b/main/openssl/crypto/aes/aes.h
index d2c99730..031abf01 100644
--- a/main/openssl/crypto/aes/aes.h
+++ b/main/openssl/crypto/aes/aes.h
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
void AES_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
diff --git a/main/openssl/crypto/aes/aes_core.c b/main/openssl/crypto/aes/aes_core.c
index a7ec54f4..8f5210ac 100644
--- a/main/openssl/crypto/aes/aes_core.c
+++ b/main/openssl/crypto/aes/aes_core.c
@@ -625,7 +625,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
int i = 0;
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
diff --git a/main/openssl/crypto/aes/aes_misc.c b/main/openssl/crypto/aes/aes_misc.c
index 4fead1b4..f083488e 100644
--- a/main/openssl/crypto/aes/aes_misc.c
+++ b/main/openssl/crypto/aes/aes_misc.c
@@ -50,6 +50,7 @@
*/
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
#include <openssl/aes.h>
#include "aes_locl.h"
@@ -62,3 +63,23 @@ const char *AES_options(void) {
return "aes(partial)";
#endif
}
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_encrypt_key(userKey, bits, key);
+ }
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_decrypt_key(userKey, bits, key);
+ }
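The aes_misc.c hunk completes the renaming seen above in aes.h and aes_core.c: the real key-schedule routines become private_AES_set_encrypt_key()/private_AES_set_decrypt_key(), while the public names turn into thin wrappers that call fips_cipher_abort(AES) when OPENSSL_FIPS is defined (to block low-level AES calls in FIPS mode, per the in-tree comment) and otherwise just forward. Callers are unaffected, as in this minimal sketch against the public API; the 16-byte vectors are arbitrary, error handling is abbreviated, and it links with -lcrypto.

#include <stdio.h>
#include <string.h>
#include <openssl/aes.h>

int main(void)
{
    /* Exactly 16 bytes each: a 128-bit key and one AES block. */
    static const unsigned char key128[16] = "0123456789abcdef";
    unsigned char in[AES_BLOCK_SIZE] = "hello, aes block";
    unsigned char out[AES_BLOCK_SIZE], back[AES_BLOCK_SIZE];
    AES_KEY enc, dec;

    /* Public entry points; after this change they pass through the
     * FIPS wrappers before reaching the private_ implementations. */
    if (AES_set_encrypt_key(key128, 128, &enc) != 0 ||
        AES_set_decrypt_key(key128, 128, &dec) != 0) {
        fprintf(stderr, "key schedule failed\n");
        return 1;
    }

    AES_encrypt(in, out, &enc);    /* one raw 16-byte block */
    AES_decrypt(out, back, &dec);

    printf("round trip %s\n",
           memcmp(in, back, AES_BLOCK_SIZE) == 0 ? "ok" : "FAILED");
    return 0;
}

Because the wrappers preserve the original signatures and return convention (0 on success, negative on bad arguments), existing code linked against AES_set_encrypt_key keeps working unchanged.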
diff --git a/main/openssl/crypto/aes/asm/aes-586.S b/main/openssl/crypto/aes/asm/aes-586.S
new file mode 100644
index 00000000..20c0238f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-586.S
@@ -0,0 +1,3239 @@
+.file "aes-586.s"
+.text
+.type _x86_AES_encrypt_compact,@function
+.align 16
+_x86_AES_encrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L000loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ecx
+ roll $24,%ecx
+ xorl %esi,%ecx
+ rorl $16,%ebp
+ xorl %ebp,%ecx
+ rorl $8,%ebp
+ xorl %ebp,%ecx
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %edx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%edx
+ roll $24,%edx
+ xorl %esi,%edx
+ rorl $16,%ebp
+ xorl %ebp,%edx
+ rorl $8,%ebp
+ xorl %ebp,%edx
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %eax,%ebp
+ xorl %edi,%esi
+ xorl %esi,%eax
+ roll $24,%eax
+ xorl %esi,%eax
+ rorl $16,%ebp
+ xorl %ebp,%eax
+ rorl $8,%ebp
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ebx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ebx
+ roll $24,%ebx
+ xorl %esi,%ebx
+ rorl $16,%ebp
+ xorl %ebp,%ebx
+ rorl $8,%ebp
+ xorl %ebp,%ebx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L000loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact
+.type _sse_AES_encrypt_compact,@function
+.align 16
+_sse_AES_encrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L001loop:
+ pshufw $8,%mm0,%mm1
+ pshufw $13,%mm4,%mm5
+ movd %mm1,%eax
+ movd %mm5,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ pshufw $13,%mm0,%mm2
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ shrl $16,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ pshufw $8,%mm4,%mm6
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm2,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ movd %mm6,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm1
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ shrl $16,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ punpckldq %mm1,%mm0
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ andl $255,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $16,%eax
+ orl %eax,%edx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm4
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movd %edx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L002out
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pcmpgtb %mm0,%mm3
+ pcmpgtb %mm4,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ pshufw $177,%mm0,%mm2
+ pshufw $177,%mm4,%mm6
+ paddb %mm0,%mm0
+ paddb %mm4,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pshufw $177,%mm2,%mm3
+ pshufw $177,%mm6,%mm7
+ pxor %mm0,%mm1
+ pxor %mm4,%mm5
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm3,%mm2
+ movq %mm7,%mm6
+ pslld $8,%mm3
+ pslld $8,%mm7
+ psrld $24,%mm2
+ psrld $24,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ psrld $8,%mm1
+ psrld $8,%mm5
+ movl -128(%ebp),%eax
+ pslld $24,%mm3
+ pslld $24,%mm7
+ movl -64(%ebp),%ebx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl (%ebp),%ecx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L001loop
+.align 16
+.L002out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact
+.type _x86_AES_encrypt,@function
+.align 16
+_x86_AES_encrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L003loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl (%ebp,%esi,8),%esi
+ movzbl %ch,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movzbl %bh,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ movl (%ebp,%edx,8),%edx
+ movzbl %ah,%eax
+ xorl 3(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ xorl 1(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L003loop
+ movl %eax,%esi
+ andl $255,%esi
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %bh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %ch,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %dh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl 2(%ebp,%edx,8),%edx
+ andl $255,%edx
+ movzbl %ah,%eax
+ movl (%ebp,%eax,8),%eax
+ andl $65280,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movl (%ebp,%ebx,8),%ebx
+ andl $16711680,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movl 2(%ebp,%ecx,8),%ecx
+ andl $4278190080,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
+.LAES_Te:
+.long 2774754246,2774754246
+.long 2222750968,2222750968
+.long 2574743534,2574743534
+.long 2373680118,2373680118
+.long 234025727,234025727
+.long 3177933782,3177933782
+.long 2976870366,2976870366
+.long 1422247313,1422247313
+.long 1345335392,1345335392
+.long 50397442,50397442
+.long 2842126286,2842126286
+.long 2099981142,2099981142
+.long 436141799,436141799
+.long 1658312629,1658312629
+.long 3870010189,3870010189
+.long 2591454956,2591454956
+.long 1170918031,1170918031
+.long 2642575903,2642575903
+.long 1086966153,1086966153
+.long 2273148410,2273148410
+.long 368769775,368769775
+.long 3948501426,3948501426
+.long 3376891790,3376891790
+.long 200339707,200339707
+.long 3970805057,3970805057
+.long 1742001331,1742001331
+.long 4255294047,4255294047
+.long 3937382213,3937382213
+.long 3214711843,3214711843
+.long 4154762323,4154762323
+.long 2524082916,2524082916
+.long 1539358875,1539358875
+.long 3266819957,3266819957
+.long 486407649,486407649
+.long 2928907069,2928907069
+.long 1780885068,1780885068
+.long 1513502316,1513502316
+.long 1094664062,1094664062
+.long 49805301,49805301
+.long 1338821763,1338821763
+.long 1546925160,1546925160
+.long 4104496465,4104496465
+.long 887481809,887481809
+.long 150073849,150073849
+.long 2473685474,2473685474
+.long 1943591083,1943591083
+.long 1395732834,1395732834
+.long 1058346282,1058346282
+.long 201589768,201589768
+.long 1388824469,1388824469
+.long 1696801606,1696801606
+.long 1589887901,1589887901
+.long 672667696,672667696
+.long 2711000631,2711000631
+.long 251987210,251987210
+.long 3046808111,3046808111
+.long 151455502,151455502
+.long 907153956,907153956
+.long 2608889883,2608889883
+.long 1038279391,1038279391
+.long 652995533,652995533
+.long 1764173646,1764173646
+.long 3451040383,3451040383
+.long 2675275242,2675275242
+.long 453576978,453576978
+.long 2659418909,2659418909
+.long 1949051992,1949051992
+.long 773462580,773462580
+.long 756751158,756751158
+.long 2993581788,2993581788
+.long 3998898868,3998898868
+.long 4221608027,4221608027
+.long 4132590244,4132590244
+.long 1295727478,1295727478
+.long 1641469623,1641469623
+.long 3467883389,3467883389
+.long 2066295122,2066295122
+.long 1055122397,1055122397
+.long 1898917726,1898917726
+.long 2542044179,2542044179
+.long 4115878822,4115878822
+.long 1758581177,1758581177
+.long 0,0
+.long 753790401,753790401
+.long 1612718144,1612718144
+.long 536673507,536673507
+.long 3367088505,3367088505
+.long 3982187446,3982187446
+.long 3194645204,3194645204
+.long 1187761037,1187761037
+.long 3653156455,3653156455
+.long 1262041458,1262041458
+.long 3729410708,3729410708
+.long 3561770136,3561770136
+.long 3898103984,3898103984
+.long 1255133061,1255133061
+.long 1808847035,1808847035
+.long 720367557,720367557
+.long 3853167183,3853167183
+.long 385612781,385612781
+.long 3309519750,3309519750
+.long 3612167578,3612167578
+.long 1429418854,1429418854
+.long 2491778321,2491778321
+.long 3477423498,3477423498
+.long 284817897,284817897
+.long 100794884,100794884
+.long 2172616702,2172616702
+.long 4031795360,4031795360
+.long 1144798328,1144798328
+.long 3131023141,3131023141
+.long 3819481163,3819481163
+.long 4082192802,4082192802
+.long 4272137053,4272137053
+.long 3225436288,3225436288
+.long 2324664069,2324664069
+.long 2912064063,2912064063
+.long 3164445985,3164445985
+.long 1211644016,1211644016
+.long 83228145,83228145
+.long 3753688163,3753688163
+.long 3249976951,3249976951
+.long 1977277103,1977277103
+.long 1663115586,1663115586
+.long 806359072,806359072
+.long 452984805,452984805
+.long 250868733,250868733
+.long 1842533055,1842533055
+.long 1288555905,1288555905
+.long 336333848,336333848
+.long 890442534,890442534
+.long 804056259,804056259
+.long 3781124030,3781124030
+.long 2727843637,2727843637
+.long 3427026056,3427026056
+.long 957814574,957814574
+.long 1472513171,1472513171
+.long 4071073621,4071073621
+.long 2189328124,2189328124
+.long 1195195770,1195195770
+.long 2892260552,2892260552
+.long 3881655738,3881655738
+.long 723065138,723065138
+.long 2507371494,2507371494
+.long 2690670784,2690670784
+.long 2558624025,2558624025
+.long 3511635870,3511635870
+.long 2145180835,2145180835
+.long 1713513028,1713513028
+.long 2116692564,2116692564
+.long 2878378043,2878378043
+.long 2206763019,2206763019
+.long 3393603212,3393603212
+.long 703524551,703524551
+.long 3552098411,3552098411
+.long 1007948840,1007948840
+.long 2044649127,2044649127
+.long 3797835452,3797835452
+.long 487262998,487262998
+.long 1994120109,1994120109
+.long 1004593371,1004593371
+.long 1446130276,1446130276
+.long 1312438900,1312438900
+.long 503974420,503974420
+.long 3679013266,3679013266
+.long 168166924,168166924
+.long 1814307912,1814307912
+.long 3831258296,3831258296
+.long 1573044895,1573044895
+.long 1859376061,1859376061
+.long 4021070915,4021070915
+.long 2791465668,2791465668
+.long 2828112185,2828112185
+.long 2761266481,2761266481
+.long 937747667,937747667
+.long 2339994098,2339994098
+.long 854058965,854058965
+.long 1137232011,1137232011
+.long 1496790894,1496790894
+.long 3077402074,3077402074
+.long 2358086913,2358086913
+.long 1691735473,1691735473
+.long 3528347292,3528347292
+.long 3769215305,3769215305
+.long 3027004632,3027004632
+.long 4199962284,4199962284
+.long 133494003,133494003
+.long 636152527,636152527
+.long 2942657994,2942657994
+.long 2390391540,2390391540
+.long 3920539207,3920539207
+.long 403179536,403179536
+.long 3585784431,3585784431
+.long 2289596656,2289596656
+.long 1864705354,1864705354
+.long 1915629148,1915629148
+.long 605822008,605822008
+.long 4054230615,4054230615
+.long 3350508659,3350508659
+.long 1371981463,1371981463
+.long 602466507,602466507
+.long 2094914977,2094914977
+.long 2624877800,2624877800
+.long 555687742,555687742
+.long 3712699286,3712699286
+.long 3703422305,3703422305
+.long 2257292045,2257292045
+.long 2240449039,2240449039
+.long 2423288032,2423288032
+.long 1111375484,1111375484
+.long 3300242801,3300242801
+.long 2858837708,2858837708
+.long 3628615824,3628615824
+.long 84083462,84083462
+.long 32962295,32962295
+.long 302911004,302911004
+.long 2741068226,2741068226
+.long 1597322602,1597322602
+.long 4183250862,4183250862
+.long 3501832553,3501832553
+.long 2441512471,2441512471
+.long 1489093017,1489093017
+.long 656219450,656219450
+.long 3114180135,3114180135
+.long 954327513,954327513
+.long 335083755,335083755
+.long 3013122091,3013122091
+.long 856756514,856756514
+.long 3144247762,3144247762
+.long 1893325225,1893325225
+.long 2307821063,2307821063
+.long 2811532339,2811532339
+.long 3063651117,3063651117
+.long 572399164,572399164
+.long 2458355477,2458355477
+.long 552200649,552200649
+.long 1238290055,1238290055
+.long 4283782570,4283782570
+.long 2015897680,2015897680
+.long 2061492133,2061492133
+.long 2408352771,2408352771
+.long 4171342169,4171342169
+.long 2156497161,2156497161
+.long 386731290,386731290
+.long 3669999461,3669999461
+.long 837215959,837215959
+.long 3326231172,3326231172
+.long 3093850320,3093850320
+.long 3275833730,3275833730
+.long 2962856233,2962856233
+.long 1999449434,1999449434
+.long 286199582,286199582
+.long 3417354363,3417354363
+.long 4233385128,4233385128
+.long 3602627437,3602627437
+.long 974525996,974525996
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.long 1,2,4,8
+.long 16,32,64,128
+.long 27,54,0,0
+.long 0,0,0,0
+.size _x86_AES_encrypt,.-_x86_AES_encrypt
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+AES_encrypt:
+.L_AES_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L004pic_point
+.L004pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L004pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Te-.L004pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L005x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L005x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_encrypt,.-.L_AES_encrypt_begin
+.type _x86_AES_decrypt_compact,@function
+.align 16
+_x86_AES_decrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L006loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%eax
+ subl %edi,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ecx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ecx,%ebx
+ roll $8,%ecx
+ xorl %esi,%ebp
+ xorl %eax,%ecx
+ xorl %ebp,%eax
+ roll $24,%eax
+ xorl %ebx,%ecx
+ xorl %ebp,%ebx
+ roll $16,%ebx
+ xorl %ebp,%ecx
+ roll $8,%ebp
+ xorl %eax,%ecx
+ xorl %ebx,%ecx
+ movl 4(%esp),%eax
+ xorl %ebp,%ecx
+ movl %ecx,12(%esp)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %edx,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %edx,%ecx
+ roll $8,%edx
+ xorl %esi,%ebp
+ xorl %ebx,%edx
+ xorl %ebp,%ebx
+ roll $24,%ebx
+ xorl %ecx,%edx
+ xorl %ebp,%ecx
+ roll $16,%ecx
+ xorl %ebp,%edx
+ roll $8,%ebp
+ xorl %ebx,%edx
+ xorl %ecx,%edx
+ movl 8(%esp),%ebx
+ xorl %ebp,%edx
+ movl %edx,16(%esp)
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %eax,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %eax,%edx
+ roll $8,%eax
+ xorl %esi,%ebp
+ xorl %ecx,%eax
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%eax
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%eax
+ roll $8,%ebp
+ xorl %ecx,%eax
+ xorl %edx,%eax
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ebx,%edx
+ roll $8,%ebx
+ xorl %esi,%ebp
+ xorl %ecx,%ebx
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%ebx
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%ebx
+ roll $8,%ebp
+ xorl %ecx,%ebx
+ xorl %edx,%ebx
+ movl 12(%esp),%ecx
+ xorl %ebp,%ebx
+ movl 16(%esp),%edx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L006loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact
+.type _sse_AES_decrypt_compact,@function
+.align 16
+_sse_AES_decrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L007loop:
+ pshufw $12,%mm0,%mm1
+ movd %mm1,%eax
+ pshufw $9,%mm4,%mm5
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm5,%ebx
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ pshufw $6,%mm0,%mm2
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ pshufw $3,%mm4,%mm6
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movd %mm2,%eax
+ movzbl -128(%ebp,%esi,1),%ecx
+ shll $16,%ecx
+ movzbl %bl,%esi
+ movd %mm6,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%ecx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%edx
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm1
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%edx
+ shll $8,%edx
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ punpckldq %mm1,%mm0
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm4
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ orl %eax,%ecx
+ movd %ecx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L008out
+ movq %mm0,%mm3
+ movq %mm4,%mm7
+ pshufw $228,%mm0,%mm2
+ pshufw $228,%mm4,%mm6
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pshufw $177,%mm0,%mm0
+ pshufw $177,%mm4,%mm4
+ pslld $8,%mm2
+ pslld $8,%mm6
+ psrld $8,%mm3
+ psrld $8,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pslld $16,%mm2
+ pslld $16,%mm6
+ psrld $16,%mm3
+ psrld $16,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movq 8(%esp),%mm3
+ pxor %mm2,%mm2
+ pxor %mm6,%mm6
+ pcmpgtb %mm1,%mm2
+ pcmpgtb %mm5,%mm6
+ pand %mm3,%mm2
+ pand %mm3,%mm6
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm2,%mm1
+ pxor %mm6,%mm5
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq %mm1,%mm2
+ movq %mm5,%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pslld $24,%mm3
+ pslld $24,%mm7
+ psrld $8,%mm2
+ psrld $8,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pshufw $177,%mm1,%mm3
+ pshufw $177,%mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ pshufw $177,%mm1,%mm2
+ pshufw $177,%mm5,%mm6
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pslld $8,%mm1
+ pslld $8,%mm5
+ psrld $8,%mm3
+ psrld $8,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl -128(%ebp),%eax
+ pslld $16,%mm1
+ pslld $16,%mm5
+ movl -64(%ebp),%ebx
+ psrld $16,%mm3
+ psrld $16,%mm7
+ movl (%ebp),%ecx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L007loop
+.align 16
+.L008out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact
+.type _x86_AES_decrypt,@function
+.align 16
+_x86_AES_decrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L009loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %ah,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl (%ebp,%edx,8),%edx
+ movzbl %ch,%ecx
+ xorl 3(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ xorl 1(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L009loop
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+ leal -128(%ebp),%ebp
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl (%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl (%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl (%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl (%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ leal -2048(%ebp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
+.LAES_Td:
+.long 1353184337,1353184337
+.long 1399144830,1399144830
+.long 3282310938,3282310938
+.long 2522752826,2522752826
+.long 3412831035,3412831035
+.long 4047871263,4047871263
+.long 2874735276,2874735276
+.long 2466505547,2466505547
+.long 1442459680,1442459680
+.long 4134368941,4134368941
+.long 2440481928,2440481928
+.long 625738485,625738485
+.long 4242007375,4242007375
+.long 3620416197,3620416197
+.long 2151953702,2151953702
+.long 2409849525,2409849525
+.long 1230680542,1230680542
+.long 1729870373,1729870373
+.long 2551114309,2551114309
+.long 3787521629,3787521629
+.long 41234371,41234371
+.long 317738113,317738113
+.long 2744600205,2744600205
+.long 3338261355,3338261355
+.long 3881799427,3881799427
+.long 2510066197,2510066197
+.long 3950669247,3950669247
+.long 3663286933,3663286933
+.long 763608788,763608788
+.long 3542185048,3542185048
+.long 694804553,694804553
+.long 1154009486,1154009486
+.long 1787413109,1787413109
+.long 2021232372,2021232372
+.long 1799248025,1799248025
+.long 3715217703,3715217703
+.long 3058688446,3058688446
+.long 397248752,397248752
+.long 1722556617,1722556617
+.long 3023752829,3023752829
+.long 407560035,407560035
+.long 2184256229,2184256229
+.long 1613975959,1613975959
+.long 1165972322,1165972322
+.long 3765920945,3765920945
+.long 2226023355,2226023355
+.long 480281086,480281086
+.long 2485848313,2485848313
+.long 1483229296,1483229296
+.long 436028815,436028815
+.long 2272059028,2272059028
+.long 3086515026,3086515026
+.long 601060267,601060267
+.long 3791801202,3791801202
+.long 1468997603,1468997603
+.long 715871590,715871590
+.long 120122290,120122290
+.long 63092015,63092015
+.long 2591802758,2591802758
+.long 2768779219,2768779219
+.long 4068943920,4068943920
+.long 2997206819,2997206819
+.long 3127509762,3127509762
+.long 1552029421,1552029421
+.long 723308426,723308426
+.long 2461301159,2461301159
+.long 4042393587,4042393587
+.long 2715969870,2715969870
+.long 3455375973,3455375973
+.long 3586000134,3586000134
+.long 526529745,526529745
+.long 2331944644,2331944644
+.long 2639474228,2639474228
+.long 2689987490,2689987490
+.long 853641733,853641733
+.long 1978398372,1978398372
+.long 971801355,971801355
+.long 2867814464,2867814464
+.long 111112542,111112542
+.long 1360031421,1360031421
+.long 4186579262,4186579262
+.long 1023860118,1023860118
+.long 2919579357,2919579357
+.long 1186850381,1186850381
+.long 3045938321,3045938321
+.long 90031217,90031217
+.long 1876166148,1876166148
+.long 4279586912,4279586912
+.long 620468249,620468249
+.long 2548678102,2548678102
+.long 3426959497,3426959497
+.long 2006899047,2006899047
+.long 3175278768,3175278768
+.long 2290845959,2290845959
+.long 945494503,945494503
+.long 3689859193,3689859193
+.long 1191869601,1191869601
+.long 3910091388,3910091388
+.long 3374220536,3374220536
+.long 0,0
+.long 2206629897,2206629897
+.long 1223502642,1223502642
+.long 2893025566,2893025566
+.long 1316117100,1316117100
+.long 4227796733,4227796733
+.long 1446544655,1446544655
+.long 517320253,517320253
+.long 658058550,658058550
+.long 1691946762,1691946762
+.long 564550760,564550760
+.long 3511966619,3511966619
+.long 976107044,976107044
+.long 2976320012,2976320012
+.long 266819475,266819475
+.long 3533106868,3533106868
+.long 2660342555,2660342555
+.long 1338359936,1338359936
+.long 2720062561,2720062561
+.long 1766553434,1766553434
+.long 370807324,370807324
+.long 179999714,179999714
+.long 3844776128,3844776128
+.long 1138762300,1138762300
+.long 488053522,488053522
+.long 185403662,185403662
+.long 2915535858,2915535858
+.long 3114841645,3114841645
+.long 3366526484,3366526484
+.long 2233069911,2233069911
+.long 1275557295,1275557295
+.long 3151862254,3151862254
+.long 4250959779,4250959779
+.long 2670068215,2670068215
+.long 3170202204,3170202204
+.long 3309004356,3309004356
+.long 880737115,880737115
+.long 1982415755,1982415755
+.long 3703972811,3703972811
+.long 1761406390,1761406390
+.long 1676797112,1676797112
+.long 3403428311,3403428311
+.long 277177154,277177154
+.long 1076008723,1076008723
+.long 538035844,538035844
+.long 2099530373,2099530373
+.long 4164795346,4164795346
+.long 288553390,288553390
+.long 1839278535,1839278535
+.long 1261411869,1261411869
+.long 4080055004,4080055004
+.long 3964831245,3964831245
+.long 3504587127,3504587127
+.long 1813426987,1813426987
+.long 2579067049,2579067049
+.long 4199060497,4199060497
+.long 577038663,577038663
+.long 3297574056,3297574056
+.long 440397984,440397984
+.long 3626794326,3626794326
+.long 4019204898,4019204898
+.long 3343796615,3343796615
+.long 3251714265,3251714265
+.long 4272081548,4272081548
+.long 906744984,906744984
+.long 3481400742,3481400742
+.long 685669029,685669029
+.long 646887386,646887386
+.long 2764025151,2764025151
+.long 3835509292,3835509292
+.long 227702864,227702864
+.long 2613862250,2613862250
+.long 1648787028,1648787028
+.long 3256061430,3256061430
+.long 3904428176,3904428176
+.long 1593260334,1593260334
+.long 4121936770,4121936770
+.long 3196083615,3196083615
+.long 2090061929,2090061929
+.long 2838353263,2838353263
+.long 3004310991,3004310991
+.long 999926984,999926984
+.long 2809993232,2809993232
+.long 1852021992,1852021992
+.long 2075868123,2075868123
+.long 158869197,158869197
+.long 4095236462,4095236462
+.long 28809964,28809964
+.long 2828685187,2828685187
+.long 1701746150,1701746150
+.long 2129067946,2129067946
+.long 147831841,147831841
+.long 3873969647,3873969647
+.long 3650873274,3650873274
+.long 3459673930,3459673930
+.long 3557400554,3557400554
+.long 3598495785,3598495785
+.long 2947720241,2947720241
+.long 824393514,824393514
+.long 815048134,815048134
+.long 3227951669,3227951669
+.long 935087732,935087732
+.long 2798289660,2798289660
+.long 2966458592,2966458592
+.long 366520115,366520115
+.long 1251476721,1251476721
+.long 4158319681,4158319681
+.long 240176511,240176511
+.long 804688151,804688151
+.long 2379631990,2379631990
+.long 1303441219,1303441219
+.long 1414376140,1414376140
+.long 3741619940,3741619940
+.long 3820343710,3820343710
+.long 461924940,461924940
+.long 3089050817,3089050817
+.long 2136040774,2136040774
+.long 82468509,82468509
+.long 1563790337,1563790337
+.long 1937016826,1937016826
+.long 776014843,776014843
+.long 1511876531,1511876531
+.long 1389550482,1389550482
+.long 861278441,861278441
+.long 323475053,323475053
+.long 2355222426,2355222426
+.long 2047648055,2047648055
+.long 2383738969,2383738969
+.long 2302415851,2302415851
+.long 3995576782,3995576782
+.long 902390199,902390199
+.long 3991215329,3991215329
+.long 1018251130,1018251130
+.long 1507840668,1507840668
+.long 1064563285,1064563285
+.long 2043548696,2043548696
+.long 3208103795,3208103795
+.long 3939366739,3939366739
+.long 1537932639,1537932639
+.long 342834655,342834655
+.long 2262516856,2262516856
+.long 2180231114,2180231114
+.long 1053059257,1053059257
+.long 741614648,741614648
+.long 1598071746,1598071746
+.long 1925389590,1925389590
+.long 203809468,203809468
+.long 2336832552,2336832552
+.long 1100287487,1100287487
+.long 1895934009,1895934009
+.long 3736275976,3736275976
+.long 2632234200,2632234200
+.long 2428589668,2428589668
+.long 1636092795,1636092795
+.long 1890988757,1890988757
+.long 1952214088,1952214088
+.long 1113045200,1113045200
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.size _x86_AES_decrypt,.-_x86_AES_decrypt
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+AES_decrypt:
+.L_AES_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L010pic_point
+.L010pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Td-.L010pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L011x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L011x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_decrypt,.-.L_AES_decrypt_begin
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+AES_cbc_encrypt:
+.L_AES_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ecx
+ cmpl $0,%ecx
+ je .L012drop_out
+ call .L013pic_point
+.L013pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L013pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ cmpl $0,40(%esp)
+ leal .LAES_Te-.L013pic_point(%ebp),%ebp
+ jne .L014picked_te
+ leal .LAES_Td-.LAES_Te(%ebp),%ebp
+.L014picked_te:
+ pushfl
+ cld
+ cmpl $512,%ecx
+ jb .L015slow_way
+ testl $15,%ecx
+ jnz .L015slow_way
+ btl $28,(%eax)
+ jc .L015slow_way
+ leal -324(%esp),%esi
+ andl $-64,%esi
+ movl %ebp,%eax
+ leal 2304(%ebp),%ebx
+ movl %esi,%edx
+ andl $4095,%eax
+ andl $4095,%ebx
+ andl $4095,%edx
+ cmpl %ebx,%edx
+ jb .L016tbl_break_out
+ subl %ebx,%edx
+ subl %edx,%esi
+ jmp .L017tbl_ok
+.align 4
+.L016tbl_break_out:
+ subl %eax,%edx
+ andl $4095,%edx
+ addl $384,%edx
+ subl %edx,%esi
+.align 4
+.L017tbl_ok:
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 12(%edx),%edi
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl $0,316(%esp)
+ movl %edi,%ebx
+ movl $61,%ecx
+ subl %ebp,%ebx
+ movl %edi,%esi
+ andl $4095,%ebx
+ leal 76(%esp),%edi
+ cmpl $2304,%ebx
+ jb .L018do_copy
+ cmpl $3852,%ebx
+ jb .L019skip_copy
+.align 4
+.L018do_copy:
+ movl %edi,44(%esp)
+.long 2784229001
+.L019skip_copy:
+ movl $16,%edi
+.align 4
+.L020prefetch_tbl:
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%esi
+ leal 128(%ebp),%ebp
+ subl $1,%edi
+ jnz .L020prefetch_tbl
+ subl $2048,%ebp
+ movl 32(%esp),%esi
+ movl 48(%esp),%edi
+ cmpl $0,%edx
+ je .L021fast_decrypt
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 16
+.L022fast_enc_loop:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ leal 16(%esi),%esi
+ movl 40(%esp),%ecx
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L022fast_enc_loop
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L023skip_ezero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L023skip_ezero:
+ movl 28(%esp),%esp
+ popfl
+.L012drop_out:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L021fast_decrypt:
+ cmpl 36(%esp),%esi
+ je .L024fast_dec_in_place
+ movl %edi,52(%esp)
+.align 4
+.align 16
+.L025fast_dec_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 52(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 36(%esp),%edi
+ movl 32(%esp),%esi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl %esi,52(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edi
+ movl %edi,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L025fast_dec_loop
+ movl 52(%esp),%edi
+ movl 48(%esp),%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ jmp .L026fast_dec_out
+.align 16
+.L024fast_dec_in_place:
+.L027fast_dec_in_place_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 48(%esp),%edi
+ movl 36(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L027fast_dec_in_place_loop
+.align 4
+.L026fast_dec_out:
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L028skip_dzero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L028skip_dzero:
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L015slow_way:
+ movl (%eax),%eax
+ movl 36(%esp),%edi
+ leal -80(%esp),%esi
+ andl $-64,%esi
+ leal -143(%edi),%ebx
+ subl %esi,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esi
+ leal 768(%esi),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl %eax,52(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl %esi,%edi
+ movl %eax,%esi
+ cmpl $0,%edx
+ je .L029slow_decrypt
+ cmpl $16,%ecx
+ movl %ebx,%edx
+ jb .L030slow_enc_tail
+ btl $25,52(%esp)
+ jnc .L031slow_enc_x86
+ movq (%edi),%mm0
+ movq 8(%edi),%mm4
+.align 16
+.L032slow_enc_loop_sse:
+ pxor (%esi),%mm0
+ pxor 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl 40(%esp),%ecx
+ movq %mm0,(%edi)
+ movq %mm4,8(%edi)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L032slow_enc_loop_sse
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L031slow_enc_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 4
+.L033slow_enc_loop_x86:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L033slow_enc_loop_x86
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L030slow_enc_tail:
+ emms
+ movl %edx,%edi
+ movl $16,%ebx
+ subl %ecx,%ebx
+ cmpl %esi,%edi
+ je .L034enc_in_place
+.align 4
+.long 2767451785
+ jmp .L035enc_skip_in_place
+.L034enc_in_place:
+ leal (%edi,%ecx,1),%edi
+.L035enc_skip_in_place:
+ movl %ebx,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2868115081
+ movl 48(%esp),%edi
+ movl %edx,%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl $16,40(%esp)
+ jmp .L033slow_enc_loop_x86
+.align 16
+.L029slow_decrypt:
+ btl $25,52(%esp)
+ jnc .L036slow_dec_loop_x86
+.align 4
+.L037slow_dec_loop_sse:
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_decrypt_compact
+ movl 32(%esp),%esi
+ leal 60(%esp),%eax
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl 48(%esp),%edi
+ movq (%esi),%mm1
+ movq 8(%esi),%mm5
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movq %mm1,(%edi)
+ movq %mm5,8(%edi)
+ subl $16,%ecx
+ jc .L038slow_dec_partial_sse
+ movq %mm0,(%ebx)
+ movq %mm4,8(%ebx)
+ leal 16(%ebx),%ebx
+ movl %ebx,36(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ movl %ecx,40(%esp)
+ jnz .L037slow_dec_loop_sse
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L038slow_dec_partial_sse:
+ movq %mm0,(%eax)
+ movq %mm4,8(%eax)
+ emms
+ addl $16,%ecx
+ movl %ebx,%edi
+ movl %eax,%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L036slow_dec_loop_x86:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt_compact
+ movl 48(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ subl $16,%esi
+ jc .L039slow_dec_partial_x86
+ movl %esi,40(%esp)
+ movl 36(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ jnz .L036slow_dec_loop_x86
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L039slow_dec_partial_x86:
+ leal 60(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 32(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl 36(%esp),%edi
+ leal 60(%esp),%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin
+.type _x86_AES_set_encrypt_key,@function
+.align 16
+_x86_AES_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%esi
+ movl 32(%esp),%edi
+ testl $-1,%esi
+ jz .L040badpointer
+ testl $-1,%edi
+ jz .L040badpointer
+ call .L041pic_point
+.L041pic_point:
+ popl %ebp
+ leal .LAES_Te-.L041pic_point(%ebp),%ebp
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+ movl 28(%esp),%ecx
+ cmpl $128,%ecx
+ je .L04210rounds
+ cmpl $192,%ecx
+ je .L04312rounds
+ cmpl $256,%ecx
+ je .L04414rounds
+ movl $-2,%eax
+ jmp .L045exit
+.L04210rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ xorl %ecx,%ecx
+ jmp .L04610shortcut
+.align 4
+.L04710loop:
+ movl (%edi),%eax
+ movl 12(%edi),%edx
+.L04610shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,16(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,20(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,24(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,28(%edi)
+ incl %ecx
+ addl $16,%edi
+ cmpl $10,%ecx
+ jl .L04710loop
+ movl $10,80(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04312rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%ecx
+ movl 20(%esi),%edx
+ movl %ecx,16(%edi)
+ movl %edx,20(%edi)
+ xorl %ecx,%ecx
+ jmp .L04812shortcut
+.align 4
+.L04912loop:
+ movl (%edi),%eax
+ movl 20(%edi),%edx
+.L04812shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,24(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,28(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,32(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,36(%edi)
+ cmpl $7,%ecx
+ je .L05012break
+ incl %ecx
+ xorl 16(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,44(%edi)
+ addl $24,%edi
+ jmp .L04912loop
+.L05012break:
+ movl $12,72(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04414rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,16(%edi)
+ movl %ebx,20(%edi)
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ xorl %ecx,%ecx
+ jmp .L05114shortcut
+.align 4
+.L05214loop:
+ movl 28(%edi),%edx
+.L05114shortcut:
+ movl (%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,32(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,36(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,44(%edi)
+ cmpl $6,%ecx
+ je .L05314break
+ incl %ecx
+ movl %eax,%edx
+ movl 16(%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movl %eax,48(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,52(%edi)
+ xorl 24(%edi),%eax
+ movl %eax,56(%edi)
+ xorl 28(%edi),%eax
+ movl %eax,60(%edi)
+ addl $32,%edi
+ jmp .L05214loop
+.L05314break:
+ movl $14,48(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L040badpointer:
+ movl $-1,%eax
+.L045exit:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+.L_private_AES_set_encrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ ret
+.size private_AES_set_encrypt_key,.-.L_private_AES_set_encrypt_key_begin
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+.L_private_AES_set_decrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ cmpl $0,%eax
+ je .L054proceed
+ ret
+.L054proceed:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%esi
+ movl 240(%esi),%ecx
+ leal (,%ecx,4),%ecx
+ leal (%esi,%ecx,4),%edi
+.align 4
+.L055invert:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl (%edi),%ecx
+ movl 4(%edi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,(%esi)
+ movl %edx,4(%esi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ addl $16,%esi
+ subl $16,%edi
+ cmpl %edi,%esi
+ jne .L055invert
+ movl 28(%esp),%edi
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,28(%esp)
+ movl 16(%edi),%eax
+.align 4
+.L056permute:
+ addl $16,%edi
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %eax,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ xorl %eax,%ecx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ roll $8,%eax
+ xorl %esi,%edx
+ movl 4(%edi),%ebp
+ xorl %ebx,%eax
+ xorl %edx,%ebx
+ xorl %ecx,%eax
+ roll $24,%ebx
+ xorl %edx,%ecx
+ xorl %edx,%eax
+ roll $16,%ecx
+ xorl %ebx,%eax
+ roll $8,%edx
+ xorl %ecx,%eax
+ movl %ebp,%ebx
+ xorl %edx,%eax
+ movl %eax,(%edi)
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ xorl %ebx,%edx
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ roll $8,%ebx
+ xorl %esi,%eax
+ movl 8(%edi),%ebp
+ xorl %ecx,%ebx
+ xorl %eax,%ecx
+ xorl %edx,%ebx
+ roll $24,%ecx
+ xorl %eax,%edx
+ xorl %eax,%ebx
+ roll $16,%edx
+ xorl %ecx,%ebx
+ roll $8,%eax
+ xorl %edx,%ebx
+ movl %ebp,%ecx
+ xorl %eax,%ebx
+ movl %ebx,4(%edi)
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %ecx,%edx
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ xorl %ecx,%eax
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ roll $8,%ecx
+ xorl %esi,%ebx
+ movl 12(%edi),%ebp
+ xorl %edx,%ecx
+ xorl %ebx,%edx
+ xorl %eax,%ecx
+ roll $24,%edx
+ xorl %ebx,%eax
+ xorl %ebx,%ecx
+ roll $16,%eax
+ xorl %edx,%ecx
+ roll $8,%ebx
+ xorl %eax,%ecx
+ movl %ebp,%edx
+ xorl %ebx,%ecx
+ movl %ecx,8(%edi)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %edx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ xorl %edx,%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ roll $8,%edx
+ xorl %esi,%ecx
+ movl 16(%edi),%ebp
+ xorl %eax,%edx
+ xorl %ecx,%eax
+ xorl %ebx,%edx
+ roll $24,%eax
+ xorl %ecx,%ebx
+ xorl %ecx,%edx
+ roll $16,%ebx
+ xorl %eax,%edx
+ roll $8,%ecx
+ xorl %ebx,%edx
+ movl %ebp,%eax
+ xorl %ecx,%edx
+ movl %edx,12(%edi)
+ cmpl 28(%esp),%edi
+ jb .L056permute
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size private_AES_set_decrypt_key,.-.L_private_AES_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
+.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
+.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,8,4
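The generated module above implements the standard OpenSSL AES entry points: AES_encrypt, AES_decrypt, AES_cbc_encrypt, and the key-schedule workers exported as private_AES_set_encrypt_key / private_AES_set_decrypt_key. A minimal caller sketch against the public C API follows; the demo() wrapper is a made-up name and not part of this diff, but the calls and return conventions match what the assembly implements (AES_set_encrypt_key returns 0 on success, -1 for a bad pointer, -2 for an unsupported bit length, as seen at .L040badpointer and .L045exit above).

#include <string.h>
#include <openssl/aes.h>

/* Illustrative caller for the routines implemented above.  demo() is
   a hypothetical name; the OpenSSL calls and constants are real. */
int demo(const unsigned char key[16], const unsigned char iv[16],
         const unsigned char *in, unsigned char *out, size_t len)
{
    AES_KEY ks;
    unsigned char ivec[16];

    if (AES_set_encrypt_key(key, 128, &ks) != 0)
        return -1;                 /* -1: NULL pointer, -2: bad bits */

    memcpy(ivec, iv, 16);          /* AES_cbc_encrypt updates ivec */
    AES_cbc_encrypt(in, out, len, &ks, ivec, AES_ENCRYPT);
    return 0;
}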
diff --git a/main/openssl/crypto/aes/asm/aes-586.pl b/main/openssl/crypto/aes/asm/aes-586.pl
index aab40e6f..51b500dd 100755..100644
--- a/main/openssl/crypto/aes/asm/aes-586.pl
+++ b/main/openssl/crypto/aes/asm/aes-586.pl
@@ -39,13 +39,13 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
@@ -223,7 +223,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
+ # recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
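The $speed_limit threshold above is exactly what the generated AES_cbc_encrypt tests before picking a code path (the cmpl $512,%ecx / testl $15,%ecx / btl $28 sequence in front of .L015slow_way in the .S output earlier): only long, whole-block inputs on cores without the hyper-threading bit set take the large-table fast path. A rough C rendering of that dispatch, with illustrative names only:

#include <stddef.h>

/* Sketch of the path selection compiled into AES_cbc_encrypt above
   (see cmpl $512 / testl $15 / btl $28 before .L015slow_way).
   use_fast_path() and SPEED_LIMIT are illustrative, not OpenSSL API. */
#define SPEED_LIMIT 512                   /* $speed_limit in aes-586.pl */

static int use_fast_path(size_t len, unsigned int ia32cap)
{
    if (len < SPEED_LIMIT)    return 0;   /* short chunk: compact code */
    if (len & 15)             return 0;   /* not a whole number of blocks */
    if (ia32cap & (1u << 28)) return 0;   /* HTT: caches are shared, stay
                                             compact to limit contention */
    return 1;
}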
@@ -562,7 +562,7 @@ sub enctransform()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
@@ -2854,12 +2854,12 @@ sub enckey()
&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
&call ("_x86_AES_set_encrypt_key");
&ret ();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ sub deckey()
&mov (&DWP(4*$i,$key),$tp1);
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
&call ("_x86_AES_set_encrypt_key");
&cmp ("eax",0);
&je (&label("proceed"));
@@ -2974,7 +2974,7 @@ sub deckey()
&jb (&label("permute"));
&xor ("eax","eax"); # return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
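Both renames in the hunks above are mechanical: the assembly now exports private_AES_set_encrypt_key / private_AES_set_decrypt_key, while the public AES_set_*_key names become thin C wrappers in OpenSSL's C sources (behind FIPS-related guards in the 1.0.1 tree). A sketch of that relationship, assuming the usual aes_misc.c arrangement; the exact preprocessor guard is omitted and should be treated as an assumption:

#include <openssl/aes.h>

/* Sketch: how the public name reaches the renamed asm entry point.
   In OpenSSL 1.0.1 this wrapper sits in crypto/aes/aes_misc.c behind
   FIPS guards; the placement here is illustrative. */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                        AES_KEY *key)
{
    return private_AES_set_encrypt_key(userKey, bits, key);
}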
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.pl b/main/openssl/crypto/aes/asm/aes-armv4.pl
index c51ee1fb..86b86c4a 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/main/openssl/crypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";
$code=<<___;
+#include "arm_arch.h"
.text
.code 32
@@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
- eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ eor $s1,$s1,$t1,ror#24
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ eor $s2,$s2,$t2,ror#16
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
- ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
- eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
- eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ eor $s1,$t1,$s1,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ eor $s2,$t2,$s2,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
- ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -392,12 +426,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -430,6 +465,22 @@ AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$key],#16
+ str $s1,[$key,#-12]
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -466,6 +517,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@@ -482,6 +534,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#16]
+ ldr $i3,[$rounds,#20]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -526,6 +588,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@@ -542,6 +605,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#24]
+ ldr $i3,[$rounds,#28]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@@ -606,14 +679,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -692,11 +765,15 @@ $code.=<<___;
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -811,7 +888,7 @@ AES_decrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -840,10 +917,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -872,11 +972,15 @@ AES_decrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
- eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ eor $s1,$s1,$t1,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@@ -929,22 +1033,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
- eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ eor $s2,$s2,$t2,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
- ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
+ ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
- ldr $i1,[$key],#16
- eor $s3,$s3,$t3,ror#8
ldr $t1,[$key,#-12]
- ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
+ ldr $t2,[$key,#-8]
+ eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@@ -985,11 +1089,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
- eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ eor $t3,$t3,$i3,lsl#8
+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@@ -997,11 +1101,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
- eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ eor $t3,$t3,$i3,lsl#16
+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
- ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
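
The repeated #if __ARM_ARCH__<7 / #else blocks in this file all encode one idea: pre-v7 cores assemble each input word from four ldrb loads, which works for either endianness, while ARMv7 loads whole words and fixes byte order with a single rev when running little-endian. A condensed sketch of the two paths (register names follow the generated listing; this just prints the text the module builds into $code):

#!/usr/bin/env perl
# Sketch of the two load paths introduced by this diff.
print <<'___';
#if __ARM_ARCH__>=7
	ldr	r0,[r12,#0]		@ one word load...
#ifdef __ARMEL__
	rev	r0,r0			@ ...plus a byte swap on LE cores
#endif
#else
	ldrb	r0,[r12,#3]		@ four byte loads merged with orr:
	ldrb	r4,[r12,#2]		@ endian-neutral, runs on ARMv4
	ldrb	r5,[r12,#1]
	ldrb	r6,[r12,#0]
	orr	r0,r0,r4,lsl#8
	orr	r0,r0,r5,lsl#16
	orr	r0,r0,r6,lsl#24
#endif
___
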
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.s b/main/openssl/crypto/aes/asm/aes-armv4.s
index 27c681c7..2697d4ce 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.s
+++ b/main/openssl/crypto/aes/asm/aes-armv4.s
@@ -1,3 +1,4 @@
+#include "arm_arch.h"
.text
.code 32
@@ -118,7 +119,7 @@ AES_encrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -147,10 +148,33 @@ AES_encrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -179,11 +203,15 @@ AES_encrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -223,11 +251,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r6,r9,ror#8
and r9,lr,r2
- eor r1,r1,r4,ror#24
ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8]
+ eor r1,r1,r4,ror#24
+ ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24]
@@ -236,16 +264,16 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r6,r9,ror#16
and r9,lr,r3,lsr#16 @ i2
- eor r2,r2,r5,ror#16
ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0]
+ eor r2,r2,r5,ror#16
+ ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16]
eor r0,r0,r7,ror#24
- ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
- eor r1,r1,r8,ror#16
ldr r7,[r11],#16
+ eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
eor r2,r2,r9,ror#8
ldr r4,[r11,#-12]
eor r3,r3,r6,ror#8
@@ -285,11 +313,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r2
- eor r1,r4,r1,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8]
+ eor r1,r4,r1,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
mov r2,r2,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0]
eor r0,r7,r0,lsl#8
ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24]
@@ -298,15 +326,15 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r3,lsr#16 @ i2
- eor r2,r5,r2,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0]
+ eor r2,r5,r2,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
mov r3,r3,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16]
eor r0,r7,r0,lsl#8
- ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
ldr r7,[r11,#0]
+ ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
eor r1,r1,r8,lsl#8
ldr r4,[r11,#4]
eor r2,r2,r9,lsl#16
@@ -323,10 +351,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -344,12 +373,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub r10,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov r12,r0 @ inp
mov lr,r1 @ bits
mov r11,r2 @ key
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -382,6 +412,22 @@ AES_set_encrypt_key:
orr r3,r3,r6,lsl#24
str r2,[r11,#-8]
str r3,[r11,#-4]
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r11],#16
+ str r1,[r11,#-12]
+ str r2,[r11,#-8]
+ str r3,[r11,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -418,6 +464,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#19]
ldrb r4,[r12,#18]
ldrb r5,[r12,#17]
@@ -434,6 +481,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#16]
+ ldr r9,[r12,#20]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -478,6 +535,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#27]
ldrb r4,[r12,#26]
ldrb r5,[r12,#25]
@@ -494,6 +552,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#24]
+ ldr r9,[r12,#28]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
mov r12,#14
str r12,[r11,#240-32]
@@ -558,14 +626,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -639,11 +707,15 @@ AES_set_decrypt_key:
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -758,7 +830,7 @@ AES_decrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -787,10 +859,33 @@ AES_decrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -819,11 +914,15 @@ AES_decrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -863,11 +962,11 @@ _armv4_AES_decrypt:
and r8,lr,r2 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r2,lsr#16
- eor r1,r1,r4,ror#8
ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8]
+ eor r1,r1,r4,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24]
@@ -876,22 +975,22 @@ _armv4_AES_decrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r3 @ i2
- eor r2,r2,r5,ror#8
ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16]
+ eor r2,r2,r5,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0]
eor r0,r0,r7,ror#8
- ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
+ ldr r7,[r11],#16
eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
eor r2,r2,r9,ror#24
- ldr r7,[r11],#16
- eor r3,r3,r6,ror#8
ldr r4,[r11,#-12]
- ldr r5,[r11,#-8]
eor r0,r0,r7
+ ldr r5,[r11,#-8]
+ eor r3,r3,r6,ror#8
ldr r6,[r11,#-4]
and r7,lr,r0,lsr#16
eor r1,r1,r4
@@ -932,11 +1031,11 @@ _armv4_AES_decrypt:
and r7,lr,r2,lsr#8 @ i0
eor r5,r5,r8,lsl#8
and r8,lr,r2 @ i1
- eor r6,r6,r9,lsl#8
ldrb r7,[r10,r7] @ Td4[s2>>8]
+ eor r6,r6,r9,lsl#8
+ ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r8,[r10,r8] @ Td4[s2>>0]
ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
@@ -944,11 +1043,11 @@ _armv4_AES_decrypt:
and r7,lr,r3,lsr#16 @ i0
eor r2,r5,r2,lsl#16
and r8,lr,r3,lsr#8 @ i1
- eor r6,r6,r9,lsl#16
ldrb r7,[r10,r7] @ Td4[s3>>16]
+ eor r6,r6,r9,lsl#16
+ ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
- ldrb r8,[r10,r8] @ Td4[s3>>8]
ldrb r9,[r10,r9] @ Td4[s3>>0]
ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
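
The other change repeated throughout these listings is the epilogue: from ARMv5 on, loading pc straight from the stack with ldmia sp!,{...,pc} performs an interworking return, so the pop-into-lr sequence kept for ARMv4 compatibility collapses to one instruction. A sketch of the guarded return as the diff applies it:

#!/usr/bin/env perl
# Sketch: the two return sequences selected by __ARM_ARCH__ above.
print <<'___';
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}		@ pop and return in one go
#else
	ldmia	sp!,{r4-r12,lr}		@ v4 ldm to pc wouldn't interwork,
	tst	lr,#1			@ so test the Thumb bit by hand:
	moveq	pc,lr			@ plain return to ARM-state callers
	.word	0xe12fff1e		@ bx lr, encoded for pre-v4T tools
#endif
___
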
diff --git a/main/openssl/crypto/aes/asm/aes-mips.S b/main/openssl/crypto/aes/asm/aes-mips.S
new file mode 100644
index 00000000..f5750bf8
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-mips.S
@@ -0,0 +1,1337 @@
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+.set noat
+.align 5
+.ent _mips_AES_encrypt
+_mips_AES_encrypt:
+ .frame $29,0,$31
+ .set reorder
+ lw $12,0($6)
+ lw $13,4($6)
+ lw $14,8($6)
+ lw $15,12($6)
+ lw $30,240($6)
+ add $3,$6,16
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ sub $30,1
+ srl $1,$9,6
+.Loop_enc:
+ srl $2,$10,6
+ srl $24,$11,6
+ srl $25,$8,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $12,2($1) # Te1[s1>>16]
+ lwl $13,2($2) # Te1[s2>>16]
+ lwl $14,2($24) # Te1[s3>>16]
+ lwl $15,2($25) # Te1[s0>>16]
+ lwr $12,3($1) # Te1[s1>>16]
+ lwr $13,3($2) # Te1[s2>>16]
+ lwr $14,3($24) # Te1[s3>>16]
+ lwr $15,3($25) # Te1[s0>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $16,1($1) # Te2[s2>>8]
+ lwl $17,1($2) # Te2[s3>>8]
+ lwl $18,1($24) # Te2[s0>>8]
+ lwl $19,1($25) # Te2[s1>>8]
+ lwr $16,2($1) # Te2[s2>>8]
+ lwr $17,2($2) # Te2[s3>>8]
+ lwr $18,2($24) # Te2[s0>>8]
+ lwr $19,2($25) # Te2[s1>>8]
+
+ srl $1,$11,22
+ srl $2,$8,22
+ srl $24,$9,22
+ srl $25,$10,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $20,0($1) # Te3[s3]
+ lwl $21,0($2) # Te3[s0]
+ lwl $22,0($24) # Te3[s1]
+ lwl $23,0($25) # Te3[s2]
+ lwr $20,1($1) # Te3[s3]
+ lwr $21,1($2) # Te3[s0]
+ lwr $22,1($24) # Te3[s1]
+ lwr $23,1($25) # Te3[s2]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+ lw $16,0($1) # Te0[s0>>24]
+ lw $17,0($2) # Te0[s1>>24]
+ lw $18,0($24) # Te0[s2>>24]
+ lw $19,0($25) # Te0[s3>>24]
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ sub $30,1
+ add $3,16
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+ .set noreorder
+ bnez $30,.Loop_enc
+ srl $1,$9,6
+
+ .set reorder
+ srl $2,$10,6
+ srl $24,$11,6
+ srl $25,$8,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $12,2($1) # Te4[s1>>16]
+ lbu $13,2($2) # Te4[s2>>16]
+ lbu $14,2($24) # Te4[s3>>16]
+ lbu $15,2($25) # Te4[s0>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,2($1) # Te4[s2>>8]
+ lbu $17,2($2) # Te4[s3>>8]
+ lbu $18,2($24) # Te4[s0>>8]
+ lbu $19,2($25) # Te4[s1>>8]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $20,2($1) # Te4[s0>>24]
+ lbu $21,2($2) # Te4[s1>>24]
+ lbu $22,2($24) # Te4[s2>>24]
+ lbu $23,2($25) # Te4[s3>>24]
+
+ srl $1,$11,22
+ srl $2,$8,22
+ srl $24,$9,22
+ srl $25,$10,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+
+ sll $12,$12,8
+ sll $13,$13,8
+ sll $14,$14,8
+ sll $15,$15,8
+
+ sll $16,$16,16
+ sll $17,$17,16
+ sll $18,$18,16
+ sll $19,$19,16
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,2($1) # Te4[s3]
+ lbu $17,2($2) # Te4[s0]
+ lbu $18,2($24) # Te4[s1]
+ lbu $19,2($25) # Te4[s2]
+
+ #sll $20,$20,0
+ #sll $21,$21,0
+ #sll $22,$22,0
+ #sll $23,$23,0
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ sll $16,$16,24
+ sll $17,$17,24
+ sll $18,$18,24
+ sll $19,$19,24
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ jr $31
+.end _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent AES_encrypt
+AES_encrypt:
+ .frame $29,64,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,64
+ sw $31,64-1*4($29)
+ sw $30,64-2*4($29)
+ sw $23,64-3*4($29)
+ sw $22,64-4*4($29)
+ sw $21,64-5*4($29)
+ sw $20,64-6*4($29)
+ sw $19,64-7*4($29)
+ sw $18,64-8*4($29)
+ sw $17,64-9*4($29)
+ sw $16,64-10*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ lwl $8,0+3($4)
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+
+ bal _mips_AES_encrypt
+
+ swr $8,0+0($5)
+ swr $9,4+0($5)
+ swr $10,8+0($5)
+ swr $11,12+0($5)
+ swl $8,0+3($5)
+ swl $9,4+3($5)
+ swl $10,8+3($5)
+ swl $11,12+3($5)
+
+ .set noreorder
+ lw $31,64-1*4($29)
+ lw $30,64-2*4($29)
+ lw $23,64-3*4($29)
+ lw $22,64-4*4($29)
+ lw $21,64-5*4($29)
+ lw $20,64-6*4($29)
+ lw $19,64-7*4($29)
+ lw $18,64-8*4($29)
+ lw $17,64-9*4($29)
+ lw $16,64-10*4($29)
+ jr $31
+ add $29,64
+.end AES_encrypt
+.align 5
+.ent _mips_AES_decrypt
+_mips_AES_decrypt:
+ .frame $29,0,$31
+ .set reorder
+ lw $12,0($6)
+ lw $13,4($6)
+ lw $14,8($6)
+ lw $15,12($6)
+ lw $30,240($6)
+ add $3,$6,16
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ sub $30,1
+ srl $1,$11,6
+.Loop_dec:
+ srl $2,$8,6
+ srl $24,$9,6
+ srl $25,$10,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $12,2($1) # Td1[s3>>16]
+ lwl $13,2($2) # Td1[s0>>16]
+ lwl $14,2($24) # Td1[s1>>16]
+ lwl $15,2($25) # Td1[s2>>16]
+ lwr $12,3($1) # Td1[s3>>16]
+ lwr $13,3($2) # Td1[s0>>16]
+ lwr $14,3($24) # Td1[s1>>16]
+ lwr $15,3($25) # Td1[s2>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $16,1($1) # Td2[s2>>8]
+ lwl $17,1($2) # Td2[s3>>8]
+ lwl $18,1($24) # Td2[s0>>8]
+ lwl $19,1($25) # Td2[s1>>8]
+ lwr $16,2($1) # Td2[s2>>8]
+ lwr $17,2($2) # Td2[s3>>8]
+ lwr $18,2($24) # Td2[s0>>8]
+ lwr $19,2($25) # Td2[s1>>8]
+
+ srl $1,$9,22
+ srl $2,$10,22
+ srl $24,$11,22
+ srl $25,$8,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $20,0($1) # Td3[s1]
+ lwl $21,0($2) # Td3[s2]
+ lwl $22,0($24) # Td3[s3]
+ lwl $23,0($25) # Td3[s0]
+ lwr $20,1($1) # Td3[s1]
+ lwr $21,1($2) # Td3[s2]
+ lwr $22,1($24) # Td3[s3]
+ lwr $23,1($25) # Td3[s0]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+
+ lw $16,0($1) # Td0[s0>>24]
+ lw $17,0($2) # Td0[s1>>24]
+ lw $18,0($24) # Td0[s2>>24]
+ lw $19,0($25) # Td0[s3>>24]
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ sub $30,1
+ add $3,16
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+ .set noreorder
+ bnez $30,.Loop_dec
+ srl $1,$11,6
+
+ .set reorder
+ lw $16,1024($7) # prefetch Td4
+ lw $17,1024+32($7)
+ lw $18,1024+64($7)
+ lw $19,1024+96($7)
+ lw $20,1024+128($7)
+ lw $21,1024+160($7)
+ lw $22,1024+192($7)
+ lw $23,1024+224($7)
+
+ srl $1,$11,8
+ srl $2,$8,8
+ srl $24,$9,8
+ srl $25,$10,8
+ and $1,0xff
+ and $2,0xff
+ and $24,0xff
+ and $25,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $12,1024($1) # Td4[s3>>16]
+ lbu $13,1024($2) # Td4[s0>>16]
+ lbu $14,1024($24) # Td4[s1>>16]
+ lbu $15,1024($25) # Td4[s2>>16]
+
+ srl $1,$10,16
+ srl $2,$11,16
+ srl $24,$8,16
+ srl $25,$9,16
+ and $1,0xff
+ and $2,0xff
+ and $24,0xff
+ and $25,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,1024($1) # Td4[s2>>8]
+ lbu $17,1024($2) # Td4[s3>>8]
+ lbu $18,1024($24) # Td4[s0>>8]
+ lbu $19,1024($25) # Td4[s1>>8]
+
+ and $1,$8,0xff
+ and $2,$9,0xff
+ and $24,$10,0xff
+ and $25,$11,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $20,1024($1) # Td4[s0>>24]
+ lbu $21,1024($2) # Td4[s1>>24]
+ lbu $22,1024($24) # Td4[s2>>24]
+ lbu $23,1024($25) # Td4[s3>>24]
+
+ srl $1,$9,24
+ srl $2,$10,24
+ srl $24,$11,24
+ srl $25,$8,24
+
+ sll $12,$12,8
+ sll $13,$13,8
+ sll $14,$14,8
+ sll $15,$15,8
+
+ sll $16,$16,16
+ sll $17,$17,16
+ sll $18,$18,16
+ sll $19,$19,16
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,1024($1) # Td4[s1]
+ lbu $17,1024($2) # Td4[s2]
+ lbu $18,1024($24) # Td4[s3]
+ lbu $19,1024($25) # Td4[s0]
+
+ #sll $20,$20,0
+ #sll $21,$21,0
+ #sll $22,$22,0
+ #sll $23,$23,0
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ sll $16,$16,24
+ sll $17,$17,24
+ sll $18,$18,24
+ sll $19,$19,24
+
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ jr $31
+.end _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent AES_decrypt
+AES_decrypt:
+ .frame $29,64,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,64
+ sw $31,64-1*4($29)
+ sw $30,64-2*4($29)
+ sw $23,64-3*4($29)
+ sw $22,64-4*4($29)
+ sw $21,64-5*4($29)
+ sw $20,64-6*4($29)
+ sw $19,64-7*4($29)
+ sw $18,64-8*4($29)
+ sw $17,64-9*4($29)
+ sw $16,64-10*4($29)
+ .set reorder
+ la $7,AES_Td # PIC-ified 'load address'
+
+ lwl $8,0+3($4)
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+
+ bal _mips_AES_decrypt
+
+ swr $8,0+0($5)
+ swr $9,4+0($5)
+ swr $10,8+0($5)
+ swr $11,12+0($5)
+ swl $8,0+3($5)
+ swl $9,4+3($5)
+ swl $10,8+3($5)
+ swl $11,12+3($5)
+
+ .set noreorder
+ lw $31,64-1*4($29)
+ lw $30,64-2*4($29)
+ lw $23,64-3*4($29)
+ lw $22,64-4*4($29)
+ lw $21,64-5*4($29)
+ lw $20,64-6*4($29)
+ lw $19,64-7*4($29)
+ lw $18,64-8*4($29)
+ lw $17,64-9*4($29)
+ lw $16,64-10*4($29)
+ jr $31
+ add $29,64
+.end AES_decrypt
+.align 5
+.ent _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+ .frame $29,0,$31
+ .set noreorder
+ beqz $4,.Lekey_done
+ li $2,-1
+ beqz $6,.Lekey_done
+ add $3,$7,1024+256
+
+ .set reorder
+ lwl $8,0+3($4) # load 128 bits
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ li $1,128
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+ .set noreorder
+ beq $5,$1,.L128bits
+ li $30,10
+
+ .set reorder
+ lwl $12,16+3($4) # load 192 bits
+ lwl $13,20+3($4)
+ li $1,192
+ lwr $12,16+0($4)
+ lwr $13,20+0($4)
+ .set noreorder
+ beq $5,$1,.L192bits
+ li $30,8
+
+ .set reorder
+ lwl $14,24+3($4) # load 256 bits
+ lwl $15,28+3($4)
+ li $1,256
+ lwr $14,24+0($4)
+ lwr $15,28+0($4)
+ .set noreorder
+ beq $5,$1,.L256bits
+ li $30,7
+
+ b .Lekey_done
+ li $2,-2
+
+.align 4
+.L128bits:
+ .set reorder
+ srl $1,$11,16
+ srl $2,$11,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$11,0xff
+ srl $25,$11,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sub $30,1
+ add $6,16
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+
+ .set noreorder
+ bnez $30,.L128bits
+ add $3,4
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ li $30,10
+ sw $11,12($6)
+ li $2,0
+ sw $30,80($6)
+ b .Lekey_done
+ sub $6,10*16
+
+.align 4
+.L192bits:
+ .set reorder
+ srl $1,$13,16
+ srl $2,$13,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$13,0xff
+ srl $25,$13,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sw $12,16($6)
+ sw $13,20($6)
+ sub $30,1
+ add $6,24
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+ xor $12,$11
+ xor $13,$12
+
+ .set noreorder
+ bnez $30,.L192bits
+ add $3,4
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ li $30,12
+ sw $11,12($6)
+ li $2,0
+ sw $30,48($6)
+ b .Lekey_done
+ sub $6,12*16
+
+.align 4
+.L256bits:
+ .set reorder
+ srl $1,$15,16
+ srl $2,$15,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$15,0xff
+ srl $25,$15,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sw $12,16($6)
+ sw $13,20($6)
+ sw $14,24($6)
+ sw $15,28($6)
+ sub $30,1
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+ beqz $30,.L256bits_done
+
+ srl $1,$11,24
+ srl $2,$11,16
+ srl $24,$11,8
+ and $25,$11,0xff
+ and $2,0xff
+ and $24,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+ sll $1,24
+ sll $2,16
+ sll $24,8
+
+ xor $12,$1
+ xor $12,$2
+ xor $12,$24
+ xor $12,$25
+
+ xor $13,$12
+ xor $14,$13
+ xor $15,$14
+
+ add $6,32
+ .set noreorder
+ b .L256bits
+ add $3,4
+
+.L256bits_done:
+ sw $8,32($6)
+ sw $9,36($6)
+ sw $10,40($6)
+ li $30,14
+ sw $11,44($6)
+ li $2,0
+ sw $30,48($6)
+ sub $6,12*16
+
+.Lekey_done:
+ jr $31
+ nop
+.end _mips_AES_set_encrypt_key
+
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
+ .frame $29,32,$31
+ .mask 3221225472,-4
+ .set noreorder
+ .cpload $25
+ sub $29,32
+ sw $31,32-1*4($29)
+ sw $30,32-2*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ .set noreorder
+ move $4,$2
+ lw $31,32-1*4($29)
+ lw $30,32-2*4($29)
+ jr $31
+ add $29,32
+.end private_AES_set_encrypt_key
+.align 5
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
+ .frame $29,32,$31
+ .mask 3221225472,-4
+ .set noreorder
+ .cpload $25
+ sub $29,32
+ sw $31,32-1*4($29)
+ sw $30,32-2*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ bltz $2,.Ldkey_done
+
+ sll $1,$30,4
+ add $4,$6,0
+ add $5,$6,$1
+.align 4
+.Lswap:
+ lw $8,0($4)
+ lw $9,4($4)
+ lw $10,8($4)
+ lw $11,12($4)
+ lw $12,0($5)
+ lw $13,4($5)
+ lw $14,8($5)
+ lw $15,12($5)
+ sw $8,0($5)
+ sw $9,4($5)
+ sw $10,8($5)
+ sw $11,12($5)
+ add $4,16
+ sub $5,16
+ sw $12,-16($4)
+ sw $13,-12($4)
+ sw $14,-8($4)
+ sw $15,-4($4)
+ bne $4,$5,.Lswap
+
+ lw $8,16($6) # modulo-scheduled
+ lui $2,0x8080
+ sub $30,1
+ or $2,0x8080
+ sll $30,2
+ add $6,16
+ lui $25,0x1b1b
+ nor $24,$0,$2
+ or $25,0x1b1b
+.align 4
+.Lmix:
+ and $1,$8,$2
+ and $9,$8,$24
+ srl $10,$1,7
+ addu $9,$9 # tp2<<1
+ subu $1,$10
+ and $1,$25
+ xor $9,$1
+
+ and $1,$9,$2
+ and $10,$9,$24
+ srl $11,$1,7
+ addu $10,$10 # tp4<<1
+ subu $1,$11
+ and $1,$25
+ xor $10,$1
+
+ and $1,$10,$2
+ and $11,$10,$24
+ srl $12,$1,7
+ addu $11,$11 # tp8<<1
+ subu $1,$12
+ and $1,$25
+ xor $11,$1
+
+ xor $12,$11,$8
+ xor $15,$11,$10
+ xor $13,$12,$9
+ xor $14,$12,$10
+
+ sll $8,$14,16
+ xor $15,$9
+ srl $9,$14,16
+ xor $15,$8
+ sll $8,$12,8
+ xor $15,$9
+ srl $9,$12,24
+ xor $15,$8
+ sll $8,$13,24
+ xor $15,$9
+ srl $9,$13,8
+ xor $15,$8
+ lw $8,4($6) # modulo-scheduled
+ xor $15,$9
+ sub $30,1
+ sw $15,0($6)
+ add $6,4
+ bnez $30,.Lmix
+
+ li $2,0
+.Ldkey_done:
+ .set noreorder
+ move $4,$2
+ lw $31,32-1*4($29)
+ lw $30,32-2*4($29)
+ jr $31
+ add $29,32
+.end private_AES_set_decrypt_key
+.rdata
+.align 6
+AES_Te:
+.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
+.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+
+.align 6
+AES_Td:
+.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
+.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
diff --git a/main/openssl/crypto/aes/asm/aes-mips.pl b/main/openssl/crypto/aes/asm/aes-mips.pl
new file mode 100644
index 00000000..e5239542
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-mips.pl
@@ -0,0 +1,1611 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for MIPS
+
+# October 2010
+#
+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
+# faster than gcc-generated code, which is not very impressive. But
+# recall that compressed S-box requires extra processing, namely
+# additional rotations. Rotations are implemented with lwl/lwr pairs,
+# which is normally used for loading unaligned data. Another cool
+# thing about this module is its endian neutrality, which means that
+# it processes data without ever changing byte order...
+
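
The lwl/lwr pairs mentioned above deserve one worked example. Each pair performs an unaligned 32-bit load, filling the "left" and "right" parts of the destination register; pointed at a table entry with a 1..3 byte offset, the same pair also yields a byte-rotated copy of the entry, which is how the compressed 1KB S-box gets by without explicit rotate instructions. A sketch using the input-load idiom from the listing above:

#!/usr/bin/env perl
# Sketch: lwl/lwr joining one possibly-unaligned word, as in AES_encrypt.
print <<'___';
	lwl	$8,0+3($4)	# "left" part of the word at 0($4)...
	lwr	$8,0+0($4)	# ..."right" part; $8 now holds it whole
___
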
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+{ $big_endian=(unpack('L',pack('N',1))==1); }
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
+
+$code.=<<___;
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+.set noat
+___
+
+{{{
+my $FRAMESIZE=16*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
+my ($key0,$cnt)=($gp,$fp);
+
+# instruction ordering is "stolen" from the output of the MIPSpro assembler
+# invoked with -mips3 -O3 arguments...
+$code.=<<___;
+.align 5
+.ent _mips_AES_encrypt
+_mips_AES_encrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+ _xtr $i0,$s1,16-2
+.Loop_enc:
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t0,3($i0) # Te1[s1>>16]
+ lwl $t1,3($i1) # Te1[s2>>16]
+ lwl $t2,3($i2) # Te1[s3>>16]
+ lwl $t3,3($i3) # Te1[s0>>16]
+ lwr $t0,2($i0) # Te1[s1>>16]
+ lwr $t1,2($i1) # Te1[s2>>16]
+ lwr $t2,2($i2) # Te1[s3>>16]
+ lwr $t3,2($i3) # Te1[s0>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t4,2($i0) # Te2[s2>>8]
+ lwl $t5,2($i1) # Te2[s3>>8]
+ lwl $t6,2($i2) # Te2[s0>>8]
+ lwl $t7,2($i3) # Te2[s1>>8]
+ lwr $t4,1($i0) # Te2[s2>>8]
+ lwr $t5,1($i1) # Te2[s3>>8]
+ lwr $t6,1($i2) # Te2[s0>>8]
+ lwr $t7,1($i3) # Te2[s1>>8]
+
+ _xtr $i0,$s3,0-2
+ _xtr $i1,$s0,0-2
+ _xtr $i2,$s1,0-2
+ _xtr $i3,$s2,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Te3[s3]
+ lwl $t9,1($i1) # Te3[s0]
+ lwl $t10,1($i2) # Te3[s1]
+ lwl $t11,1($i3) # Te3[s2]
+ lwr $t8,0($i0) # Te3[s3]
+ lwr $t9,0($i1) # Te3[s0]
+ lwr $t10,0($i2) # Te3[s1]
+ lwr $t11,0($i3) # Te3[s2]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+ lw $t4,0($i0) # Te0[s0>>24]
+ lw $t5,0($i1) # Te0[s1>>24]
+ lw $t6,0($i2) # Te0[s2>>24]
+ lw $t7,0($i3) # Te0[s3>>24]
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_enc
+ _xtr $i0,$s1,16-2
+
+ .set reorder
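+	# last round: bytes 1 and 2 of every Te0 entry hold the plain
+	# S-box value, so an lbu at offset 2 of the scaled index serves as
+	# the Te4 lookup without a separate 256-byte table walk.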
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,2($i0) # Te4[s1>>16]
+ lbu $t1,2($i1) # Te4[s2>>16]
+ lbu $t2,2($i2) # Te4[s3>>16]
+ lbu $t3,2($i3) # Te4[s0>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,2($i0) # Te4[s2>>8]
+ lbu $t5,2($i1) # Te4[s3>>8]
+ lbu $t6,2($i2) # Te4[s0>>8]
+ lbu $t7,2($i3) # Te4[s1>>8]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,2($i0) # Te4[s0>>24]
+ lbu $t9,2($i1) # Te4[s1>>24]
+ lbu $t10,2($i2) # Te4[s2>>24]
+ lbu $t11,2($i3) # Te4[s3>>24]
+
+ _xtr $i0,$s3,0-2
+ _xtr $i1,$s0,0-2
+ _xtr $i2,$s1,0-2
+ _xtr $i3,$s2,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,2($i0) # Te4[s3]
+ lbu $t5,2($i1) # Te4[s0]
+ lbu $t6,2($i2) # Te4[s1]
+ lbu $t7,2($i3) # Te4[s2]
+
+ _ins $t8,24
+ _ins $t9,24
+ _ins $t10,24
+ _ins $t11,24
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent AES_encrypt
+AES_encrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_encrypt
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_encrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_encrypt
+___
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_decrypt
+_mips_AES_decrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+ _xtr $i0,$s3,16-2
+.Loop_dec:
+ _xtr $i1,$s0,16-2
+ _xtr $i2,$s1,16-2
+ _xtr $i3,$s2,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t0,3($i0) # Td1[s3>>16]
+ lwl $t1,3($i1) # Td1[s0>>16]
+ lwl $t2,3($i2) # Td1[s1>>16]
+ lwl $t3,3($i3) # Td1[s2>>16]
+ lwr $t0,2($i0) # Td1[s3>>16]
+ lwr $t1,2($i1) # Td1[s0>>16]
+ lwr $t2,2($i2) # Td1[s1>>16]
+ lwr $t3,2($i3) # Td1[s2>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t4,2($i0) # Td2[s2>>8]
+ lwl $t5,2($i1) # Td2[s3>>8]
+ lwl $t6,2($i2) # Td2[s0>>8]
+ lwl $t7,2($i3) # Td2[s1>>8]
+ lwr $t4,1($i0) # Td2[s2>>8]
+ lwr $t5,1($i1) # Td2[s3>>8]
+ lwr $t6,1($i2) # Td2[s0>>8]
+ lwr $t7,1($i3) # Td2[s1>>8]
+
+ _xtr $i0,$s1,0-2
+ _xtr $i1,$s2,0-2
+ _xtr $i2,$s3,0-2
+ _xtr $i3,$s0,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Td3[s1]
+ lwl $t9,1($i1) # Td3[s2]
+ lwl $t10,1($i2) # Td3[s3]
+ lwl $t11,1($i3) # Td3[s0]
+ lwr $t8,0($i0) # Td3[s1]
+ lwr $t9,0($i1) # Td3[s2]
+ lwr $t10,0($i2) # Td3[s3]
+ lwr $t11,0($i3) # Td3[s0]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ lw $t4,0($i0) # Td0[s0>>24]
+ lw $t5,0($i1) # Td0[s1>>24]
+ lw $t6,0($i2) # Td0[s2>>24]
+ lw $t7,0($i3) # Td0[s3>>24]
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_dec
+ _xtr $i0,$s3,16-2
+
+ .set reorder
+ lw $t4,1024($Tbl) # prefetch Td4
+ lw $t5,1024+32($Tbl)
+ lw $t6,1024+64($Tbl)
+ lw $t7,1024+96($Tbl)
+ lw $t8,1024+128($Tbl)
+ lw $t9,1024+160($Tbl)
+ lw $t10,1024+192($Tbl)
+ lw $t11,1024+224($Tbl)
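+	# unlike encryption, the last decrypt round needs the genuine
+	# inverse S-box: Td0 entries carry only GF(2^8) multiples of it,
+	# so Td4 is kept as 256 plain bytes at offset 1024 past Td0.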
+
+ _xtr $i0,$s3,16
+ _xtr $i1,$s0,16
+ _xtr $i2,$s1,16
+ _xtr $i3,$s2,16
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,0xff
+ and $i3,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,1024($i0) # Td4[s3>>16]
+ lbu $t1,1024($i1) # Td4[s0>>16]
+ lbu $t2,1024($i2) # Td4[s1>>16]
+ lbu $t3,1024($i3) # Td4[s2>>16]
+
+ _xtr $i0,$s2,8
+ _xtr $i1,$s3,8
+ _xtr $i2,$s0,8
+ _xtr $i3,$s1,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,0xff
+ and $i3,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ lbu $t7,1024($i3) # Td4[s1>>8]
+
+ _xtr $i0,$s0,24
+ _xtr $i1,$s1,24
+ _xtr $i2,$s2,24
+ _xtr $i3,$s3,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ lbu $t11,1024($i3) # Td4[s3>>24]
+
+ _xtr $i0,$s1,0
+ _xtr $i1,$s2,0
+ _xtr $i2,$s3,0
+ _xtr $i3,$s0,0
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,1024($i0) # Td4[s1]
+ lbu $t5,1024($i1) # Td4[s2]
+ lbu $t6,1024($i2) # Td4[s3]
+ lbu $t7,1024($i3) # Td4[s0]
+
+ _ins $t8,24
+ _ins $t9,24
+ _ins $t10,24
+ _ins $t11,24
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent AES_decrypt
+AES_decrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_decrypt
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Td # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_decrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_decrypt
+___
+}}}
+
+{{{
+my $FRAMESIZE=8*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
+
+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($rcon,$cnt)=($gp,$fp);
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+ .frame $sp,0,$ra
+ .set noreorder
+ beqz $inp,.Lekey_done
+ li $t0,-1
+ beqz $key,.Lekey_done
+ $PTR_ADD $rcon,$Tbl,1024+256
+
+ .set reorder
+ lwl $rk0,0+$MSB($inp) # load 128 bits
+ lwl $rk1,4+$MSB($inp)
+ lwl $rk2,8+$MSB($inp)
+ lwl $rk3,12+$MSB($inp)
+ li $at,128
+ lwr $rk0,0+$LSB($inp)
+ lwr $rk1,4+$LSB($inp)
+ lwr $rk2,8+$LSB($inp)
+ lwr $rk3,12+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L128bits
+ li $cnt,10
+
+ .set reorder
+ lwl $rk4,16+$MSB($inp) # load 192 bits
+ lwl $rk5,20+$MSB($inp)
+ li $at,192
+ lwr $rk4,16+$LSB($inp)
+ lwr $rk5,20+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L192bits
+ li $cnt,8
+
+ .set reorder
+ lwl $rk6,24+$MSB($inp) # load 256 bits
+ lwl $rk7,28+$MSB($inp)
+ li $at,256
+ lwr $rk6,24+$LSB($inp)
+ lwr $rk7,28+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L256bits
+ li $cnt,7
+
+ b .Lekey_done
+ li $t0,-2
+
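+	# the three paths above load 128/192/256 key bits and set the
+	# expansion loop count to 10/8/7 iterations respectively; t0
+	# carries the return value, -1 for NULL arguments, -2 for an
+	# unsupported key length and 0 on success.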
+.align 4
+.L128bits:
+ .set reorder
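+	# each iteration folds SubWord(RotWord(rk3)) and the next round
+	# constant into rk0: the four bytes of rk3 go through the S-box at
+	# Tbl+1024 and are repositioned by the made-up _bias instruction
+	# [see the post-processor] before being xored in.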
+ srl $i0,$rk3,16
+ srl $i1,$rk3,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk3,0xff
+ srl $i3,$rk3,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sub $cnt,1
+ $PTR_ADD $key,16
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+
+ .set noreorder
+ bnez $cnt,.L128bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,10
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,80($key)
+ b .Lekey_done
+ $PTR_SUB $key,10*16
+
+.align 4
+.L192bits:
+ .set reorder
+ srl $i0,$rk5,16
+ srl $i1,$rk5,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk5,0xff
+ srl $i3,$rk5,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sub $cnt,1
+ $PTR_ADD $key,24
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ xor $rk4,$rk3
+ xor $rk5,$rk4
+
+ .set noreorder
+ bnez $cnt,.L192bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,12
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,48($key)
+ b .Lekey_done
+ $PTR_SUB $key,12*16
+
+.align 4
+.L256bits:
+ .set reorder
+ srl $i0,$rk7,16
+ srl $i1,$rk7,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk7,0xff
+ srl $i3,$rk7,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sw $rk6,24($key)
+ sw $rk7,28($key)
+ sub $cnt,1
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ beqz $cnt,.L256bits_done
+
+ srl $i0,$rk3,24
+ srl $i1,$rk3,16
+ srl $i2,$rk3,8
+ and $i3,$rk3,0xff
+ and $i1,0xff
+ and $i2,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+ sll $i0,24
+ sll $i1,16
+ sll $i2,8
+
+ xor $rk4,$i0
+ xor $rk4,$i1
+ xor $rk4,$i2
+ xor $rk4,$i3
+
+ xor $rk5,$rk4
+ xor $rk6,$rk5
+ xor $rk7,$rk6
+
+ $PTR_ADD $key,32
+ .set noreorder
+ b .L256bits
+ $PTR_ADD $rcon,4
+
+.L256bits_done:
+ sw $rk0,32($key)
+ sw $rk1,36($key)
+ sw $rk2,40($key)
+ li $cnt,14
+ sw $rk3,44($key)
+ li $t0,0
+ sw $cnt,48($key)
+ $PTR_SUB $key,12*16
+
+.Lekey_done:
+ jr $ra
+ nop
+.end _mips_AES_set_encrypt_key
+
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,private_AES_set_encrypt_key
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end private_AES_set_encrypt_key
+___
+
+my ($head,$tail)=($inp,$bits);
+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
+$code.=<<___;
+.align 5
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,private_AES_set_decrypt_key
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ bltz $t0,.Ldkey_done
+
+ sll $at,$cnt,4
+ $PTR_ADD $head,$key,0
+ $PTR_ADD $tail,$key,$at
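+	# the decrypt key schedule starts out as the encrypt one with the
+	# order of the 16-byte round keys inverted in place, swapping from
+	# both ends toward the middle: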
+.align 4
+.Lswap:
+ lw $rk0,0($head)
+ lw $rk1,4($head)
+ lw $rk2,8($head)
+ lw $rk3,12($head)
+ lw $rk4,0($tail)
+ lw $rk5,4($tail)
+ lw $rk6,8($tail)
+ lw $rk7,12($tail)
+ sw $rk0,0($tail)
+ sw $rk1,4($tail)
+ sw $rk2,8($tail)
+ sw $rk3,12($tail)
+ $PTR_ADD $head,16
+ $PTR_SUB $tail,16
+ sw $rk4,-16($head)
+ sw $rk5,-12($head)
+ sw $rk6,-8($head)
+ sw $rk7,-4($head)
+ bne $head,$tail,.Lswap
+
+ lw $tp1,16($key) # modulo-scheduled
+ lui $x80808080,0x8080
+ sub $cnt,1
+ or $x80808080,0x8080
+ sll $cnt,2
+ $PTR_ADD $key,16
+ lui $x1b1b1b1b,0x1b1b
+ nor $x7f7f7f7f,$zero,$x80808080
+ or $x1b1b1b1b,0x1b1b
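+	# .Lmix applies InvMixColumns to the inner round keys with the
+	# classic SIMD-within-a-register doubling: tp2 is (tp1&0x7f7f7f7f)
+	# shifted left by one, xor-corrected with 0x1b in every byte whose
+	# top bit [caught by the 0x80808080 mask] was set; tp4 and tp8
+	# repeat the doubling.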
+.align 4
+.Lmix:
+ and $m,$tp1,$x80808080
+ and $tp2,$tp1,$x7f7f7f7f
+ srl $tp4,$m,7
+ addu $tp2,$tp2 # tp2<<1
+ subu $m,$tp4
+ and $m,$x1b1b1b1b
+ xor $tp2,$m
+
+ and $m,$tp2,$x80808080
+ and $tp4,$tp2,$x7f7f7f7f
+ srl $tp8,$m,7
+ addu $tp4,$tp4 # tp4<<1
+ subu $m,$tp8
+ and $m,$x1b1b1b1b
+ xor $tp4,$m
+
+ and $m,$tp4,$x80808080
+ and $tp8,$tp4,$x7f7f7f7f
+ srl $tp9,$m,7
+ addu $tp8,$tp8 # tp8<<1
+ subu $m,$tp9
+ and $m,$x1b1b1b1b
+ xor $tp8,$m
+
+ xor $tp9,$tp8,$tp1
+ xor $tpe,$tp8,$tp4
+ xor $tpb,$tp9,$tp2
+ xor $tpd,$tp9,$tp4
+
+ _ror $tp1,$tpd,16
+ xor $tpe,$tp2
+ _ror $tp2,$tpd,-16
+ xor $tpe,$tp1
+ _ror $tp1,$tp9,8
+ xor $tpe,$tp2
+ _ror $tp2,$tp9,-24
+ xor $tpe,$tp1
+ _ror $tp1,$tpb,24
+ xor $tpe,$tp2
+ _ror $tp2,$tpb,-8
+ xor $tpe,$tp1
+ lw $tp1,4($key) # modulo-scheduled
+ xor $tpe,$tp2
+ sub $cnt,1
+ sw $tpe,0($key)
+ $PTR_ADD $key,4
+ bnez $cnt,.Lmix
+
+ li $t0,0
+.Ldkey_done:
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end private_AES_set_decrypt_key
+___
+}}}
+
+######################################################################
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+.rdata
+.align 6
+AES_Te:
+.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
+.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+
+.align 6
+AES_Td:
+.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
+.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# made-up instructions _xtr, _ins, _ror and _bias cope with
+	# byte-order dependencies...
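+	# e.g. "_ins $12,16" is first rewritten to its 3-operand form
+	# "_ins $12,$12,16" and then becomes "sll $12,$12,16" in big-endian
+	# output or "sll $12,$12,8" [24-16] in little-endian output, where
+	# the state words were loaded byte-reversed.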
+ if (/^\s+_/) {
+ s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
+
+ s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("$3*-1"))/e or
+ s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("($3-16)&31"))/e;
+
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
+ sprintf("sll\t$1,$2,$3")/e or
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
+ sprintf("and\t$1,$2,0xff")/e or
+ s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
+ }
+
+ # convert lwl/lwr and swr/swl to little-endian order
+ if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
+ s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
+ s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/aes-parisc.pl b/main/openssl/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 00000000..714dcfbb
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1022 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is a mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further, down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# comes to about 33 cycles per byte processed with a 128-bit key. Newer
+# CPUs perform at 16 cycles per byte. It's not faster than code generated
+# by the vendor compiler, but recall that it has compressed S-boxes,
+# which require extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$inp="%r26"; # arg0
+$out="%r25"; # arg1
+$key="%r24"; # arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+AES_encrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$enc_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
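+	; position-independent table address: blr with %r0 branches to the
+	; instruction after its delay slot and leaves that address, i.e.
+	; L\$enc_pic with the privilege level in its low bits, in $tbl;
+	; andcm strips those bits and ldo then adds the constant
+	; displacement to L\$AES_Te.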
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$enc_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
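+	; misaligned input: $inp was rounded down to a word boundary above,
+	; a fifth word is pulled in, and the vshd funnel shifts realign
+	; adjacent word pairs by the bit count preloaded into %sar [cr11].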
+
+L\$enc_inp_aligned
+ bl _parisc_AES_encrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$enc_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$enc_done
+ stb $s3,15($out)
+
+L\$enc_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$enc_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_encrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 12($key),$t3
+ _srm $s0,24,$acc0
+ xor $t1,$s1,$s1
+ ldw 16($key),$t0
+ _srm $s1,16,$acc1
+ xor $t2,$s2,$s2
+ ldw 20($key),$t1
+ xor $t3,$s3,$s3
+ ldw 24($key),$t2
+ ldw 28($key),$t3
+L\$enc_loop
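+	; the ,s completer on ldwx scales the index by 4, so _srm [another
+	; made-up instruction, expanded later in this script] extracts
+	; plain byte values rather than the pre-scaled offsets the MIPS
+	; module uses.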
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$enc_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch te4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch te4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch te4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch te4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch te4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch te4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch te4
+ _srm $s1,16,$acc1
+ xor $acc14,$s3,$s3
+ b L\$enc_loop
+ xor $acc15,$s3,$s3
+
+ .ALIGN 16
+L\$enc_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t2,16,$acc5
+ _srm $t3,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t3,16,$acc9
+ _srm $t0,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t0,16,$acc13
+ _srm $t1,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t2,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
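+	; dep merges the four S-box bytes into one word: 7/15/23 name the
+	; rightmost bit of the destination byte field in PA-RISC's
+	; MSB-first bit numbering, while the accumulator being deposited
+	; into already holds the low byte.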
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Te
+ .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+ .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+ .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+ .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+ .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+ .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+ .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+ .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+ .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+ .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+ .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+ .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+ .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+ .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+ .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+ .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+ .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+ .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+ .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+ .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+ .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+ .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+ .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+ .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+ .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+ .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+ .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+ .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+ .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+ .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+ .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+ .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+ .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+ .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+ .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+ .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+ .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+ .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+ .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+ .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+ .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+ .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+ .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+ .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+ .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+ .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+ .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+ .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+ .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+ .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+ .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+ .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+ .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+ .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+ .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+ .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+ .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+ .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+ .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+ .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+ .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+ .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+ .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+ .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+ .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+ .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 16
+AES_decrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$dec_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$dec_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$dec_inp_aligned
+ bl _parisc_AES_decrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$dec_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$dec_done
+ stb $s3,15($out)
+
+L\$dec_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$dec_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_decrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ ldw 12($key),$t3
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 16($key),$t0
+ xor $t1,$s1,$s1
+ ldw 20($key),$t1
+ _srm $s0,24,$acc0
+ xor $t2,$s2,$s2
+ ldw 24($key),$t2
+ xor $t3,$s3,$s3
+ ldw 28($key),$t3
+ _srm $s3,16,$acc1
+L\$dec_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$dec_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch td4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch td4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch td4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch td4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch td4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch td4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch td4
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ b L\$dec_loop
+ _srm $s3,16,$acc1
+
+ .ALIGN 16
+L\$dec_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t0,16,$acc5
+ _srm $t1,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t1,16,$acc9
+ _srm $t2,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t2,16,$acc13
+ _srm $t3,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t0,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Td
+ .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+ .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+ .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+ .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+ .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+ .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+ .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+ .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+ .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+ .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+ .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+ .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+ .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+ .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+ .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+ .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+ .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+ .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+ .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+ .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+ .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+ .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+ .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+ .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+ .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+ .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+ .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+ .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+ .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+ .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+ .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+ .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+ .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+ .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+ .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+ .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+ .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+ .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+ .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+ .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+ .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+ .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+ .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+ .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+ .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+ .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+ .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+ .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+ .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+ .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+ .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+ .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+ .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+ .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+ .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+ .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+ .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+ .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+ .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+ .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+ .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+ .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+ .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+ .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+ .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made-up instructions: _ror, _srm
+ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
+
+ s/_srm(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+ : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+ s/,\*/,/ if ($SIZE_T==4);
+ s/\bbv\b(.*\(%r2\))/bve$1/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+close STDOUT;
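The post-processing loop above is the whole story behind the made-up _ror and _srm mnemonics. A standalone sketch (hypothetical demo harness, not part of the patch) replays the same substitutions on two sample lines; with $SIZE_T==4, "_srm %r26,24,%r1" becomes "extru %r26,7,8,%r1" and "_ror %r20,8,%r20" becomes "shd %r20,%r20,8,%r20":

    #!/usr/bin/perl
    # Replays the _ror/_srm translation from the loop above.
    my $SIZE_T = 4;                              # assume a 32-bit build

    for my $line ("\t_ror\t%r20,8,%r20", "\t_srm\t%r26,24,%r1") {
        my $out = $line;
        $out =~ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/  # rotate = double-shift
            or
        $out =~ s/_srm(\s+%r[0-9]+),([0-9]+),/       # shift right, mask 8 bits
            $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
                       : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
        print "$line  =>  $out\n";
    }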
diff --git a/main/openssl/crypto/aes/asm/aes-ppc.pl b/main/openssl/crypto/aes/asm/aes-ppc.pl
index f82c5e18..7c52cbe5 100644
--- a/main/openssl/crypto/aes/asm/aes-ppc.pl
+++ b/main/openssl/crypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
# February 2010
#
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
@@ -116,15 +118,19 @@ LAES_Te:
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
- .space `32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
- addi $Tbl0,$Tbl0,`128-8-32+2048+256`
+ addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
- .space `128-32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
.globl .AES_encrypt
.align 7
.AES_encrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ $code.=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Lenc_unaligned
+Lenc_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -363,8 +375,80 @@ $code.=<<___;
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Lenc_done
+
+Lenc_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Lenc_xpage
+ andi. $t1,$t1,4096-16
+ bne Lenc_unaligned_ok
+
+Lenc_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ $code.=<<___;
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -413,44 +500,44 @@ Lenc_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -466,60 +553,60 @@ Lenc_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
+ lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
- lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc01,$Tbl2,$acc01
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc09,$Tbl2,$acc09
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc11,$Tbl2,$acc11
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
@@ -530,29 +617,31 @@ Lenc_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ Lenc_compact_loop:
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -612,12 +701,12 @@ Lenc_compact_loop:
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ Lenc_compact_loop:
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
xor $acc02,$acc02,$acc10
+ rotlwi $acc13,$s1,16
xor $acc03,$acc03,$acc11
+ rotlwi $acc14,$s2,16
- rotlwi $acc12,$s0,16 # ROTATE(r0,16)
- rotlwi $acc13,$s1,16
- rotlwi $acc14,$s2,16
- rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
+ rotlwi $acc15,$s3,16
xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
+ xor $s2,$s2,$acc02
rotrwi $s1,$s1,24
+ xor $s3,$s3,$acc03
rotrwi $s2,$s2,24
- rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
+ rotrwi $s3,$s3,24
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
- rotlwi $acc09,$acc13,8
- rotlwi $acc10,$acc14,8
- rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
+ rotlwi $acc09,$acc13,8
xor $s1,$s1,$acc13
+ rotlwi $acc10,$acc14,8
xor $s2,$s2,$acc14
+ rotlwi $acc11,$acc15,8
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
@@ -673,14 +762,15 @@ Lenc_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ Lenc_compact_done:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Ldec_unaligned
+
+Ldec_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -712,8 +809,80 @@ Lenc_compact_done:
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Ldec_done
+
+Ldec_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Ldec_xpage
+ andi. $t1,$t1,4096-16
+ bne Ldec_unaligned_ok
+
+Ldec_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Ldec_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ Lenc_compact_done:
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -762,44 +934,44 @@ Ldec_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -815,56 +987,56 @@ Ldec_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
+ lwz $acc11,`2048+96`($Tbl0)
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl2,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl2,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ Ldec_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
___
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ $code.=<<___;
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ Ldec_compact_loop:
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
- srwi $acc09,$acc05,7
- srwi $acc10,$acc06,7
- srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ srwi $acc09,$acc05,7
andc $acc13,$acc01,$mask80
+ srwi $acc10,$acc06,7
andc $acc14,$acc02,$mask80
+ srwi $acc11,$acc07,7
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
- and $acc10,$acc06,$mask80
- and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
+ and $acc10,$acc06,$mask80
srwi $acc13,$acc09,7
+ and $acc11,$acc07,$mask80
srwi $acc14,$acc10,7
- srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ srwi $acc15,$acc11,7
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ___
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
- rotrwi $s2,$s2,8
- rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
+ rotrwi $s2,$s2,8
xor $s1,$s1,$acc01
+ rotrwi $s3,$s3,8
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ $code.=<<___;
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
- xor $s1,$s1,$acc05
- xor $s2,$s2,$acc06
- xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
+ xor $s1,$s1,$acc05
rotrwi $acc01,$acc01,24
+ xor $s2,$s2,$acc06
rotrwi $acc02,$acc02,24
+ xor $s3,$s3,$acc07
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
- xor $s1,$s1,$acc09
- xor $s2,$s2,$acc10
- xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
+ xor $s1,$s1,$acc09
rotrwi $acc05,$acc05,16
+ xor $s2,$s2,$acc10
rotrwi $acc06,$acc06,16
+ xor $s3,$s3,$acc11
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
- xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
+ xor $s1,$s1,$acc01
rotrwi $acc09,$acc09,8
+ xor $s2,$s2,$acc02
rotrwi $acc10,$acc10,8
+ xor $s3,$s3,$acc03
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ Ldec_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
-.long 0
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
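A gloss on the new Lenc_unaligned/Ldec_unaligned logic in this hunk: unaligned word accesses are tolerable on PPC as long as they stay within one page, so when $inp or $out is not word-aligned the subfic/andi. pair only diverts to the byte-wise Lenc_xpage path (lbz/insrwi gather, extrwi/stb scatter) if fewer than 16 bytes remain before the next 4KB boundary. A minimal sketch of that predicate, with addresses as plain integers (illustrative only):

    # Mirrors: subfic t0,addr,4096 ; andi. t0,t0,4096-16 ; beq xpage
    sub may_cross_4k_page {
        my ($addr) = @_;                 # start of a 16-byte block
        # true iff fewer than 16 bytes remain before the page boundary;
        # a page-aligned address also satisfies it, which is a harmless
        # over-approximation since the byte-wise path works everywhere
        return ((4096 - $addr) & (4096 - 16) & 0xffff) == 0;
    }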
diff --git a/main/openssl/crypto/aes/asm/aes-s390x.pl b/main/openssl/crypto/aes/asm/aes-s390x.pl
index 7e018892..e75dcd03 100644
--- a/main/openssl/crypto/aes/asm/aes-s390x.pl
+++ b/main/openssl/crypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over the "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that the exact throughput depends on the current stack
+# frame alignment within a 4KB page. In the worst case you get ~75% of
+# the maximum, but *on average* it is as much as ~98%, meaning that the
+# worst case is unlikely; it's like hitting a ravine on a plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, and ~37% at the more common 512-byte block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
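This flavour block is what keeps the rest of the file ABI-neutral: $g is spliced into mnemonics (st${g}, lm${g}, sl${g}r and so on) so one template assembles 32-bit operations for the -m31 build and 64-bit ones otherwise, while $SIZE_T scales stack-slot offsets. A toy demonstration (standalone, not part of the build):

    # One template line, two ABIs: $g and $SIZE_T do all the work.
    for my $flavour ("linux32", "linux64") {
        my ($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
        print "$flavour:  st${g} %r3,", 3*$SIZE_T, "(%r15)\n";
    }
    # prints:  linux32:  st %r3,12(%r15)
    #          linux64:  stg %r3,24(%r15)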
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$softonly=0; # allow hardware support
$t0="%r0"; $mask="%r0";
@@ -69,6 +114,8 @@ $rounds="%r13";
$ra="%r14";
$sp="%r15";
+$stdframe=16*$SIZE_T+4*8;
+
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly);
.Lesoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -220,20 +267,20 @@ $code.=<<___;
larl $tbl,AES_Te
bras $ra,_s390x_AES_encrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_encrypt,.-AES_encrypt
.type _s390x_AES_encrypt,\@function
.align 16
_s390x_AES_encrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -397,7 +444,7 @@ _s390x_AES_encrypt:
or $s2,$i3
or $s3,$t3
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
xr $s0,$t0
xr $s1,$t2
x $s2,24($key)
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly);
.Ldsoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -546,20 +593,20 @@ $code.=<<___;
larl $tbl,AES_Td
bras $ra,_s390x_AES_decrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_decrypt,.-AES_decrypt
.type _s390x_AES_decrypt,\@function
.align 16
_s390x_AES_decrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -703,7 +750,7 @@ _s390x_AES_decrypt:
nr $i1,$mask
nr $i2,$mask
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
or $s1,$t1
l $t0,16($key)
l $t1,20($key)
@@ -732,14 +779,15 @@ ___
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
lghi $t0,0
- clgr $inp,$t0
+ cl${g}r $inp,$t0
je .Lminus1
- clgr $key,$t0
+ cl${g}r $key,$t0
je .Lminus1
lghi $t0,128
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly);
je 1f
lg %r1,24($inp)
stg %r1,24($key)
-1: st $bits,236($key) # save bits
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
st %r5,240($key) # save km code
lghi %r2,0
br %r14
@@ -797,7 +846,7 @@ ___
$code.=<<___;
.align 16
.Lekey_internal:
- stmg %r6,%r13,48($sp) # all non-volatile regs
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
larl $tbl,AES_Te+2048
@@ -857,8 +906,9 @@ $code.=<<___;
la $key,16($key) # key+=4
la $t3,4($t3) # i++
brct $rounds,.L128_loop
+ lghi $t0,10
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -905,8 +955,9 @@ $code.=<<___;
st $s2,32($key)
st $s3,36($key)
brct $rounds,.L192_continue
+ lghi $t0,12
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -967,8 +1018,9 @@ $code.=<<___;
st $s2,40($key)
st $s3,44($key)
brct $rounds,.L256_continue
+ lghi $t0,14
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -1011,42 +1063,34 @@ $code.=<<___;
.Lminus1:
lghi %r2,-1
br $ra
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
# void AES_set_decrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function
.align 16
-AES_set_decrypt_key:
- stg $key,32($sp) # I rely on AES_set_encrypt_key to
- stg $ra,112($sp) # save non-volatile registers!
- bras $ra,AES_set_encrypt_key
- lg $key,32($sp)
- lg $ra,112($sp)
+private_AES_set_decrypt_key:
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
+ l${g} $ra,14*$SIZE_T($sp)
ltgr %r2,%r2
bnzr $ra
___
$code.=<<___ if (!$softonly);
- l $t0,240($key)
+ #l $t0,240($key)
lhi $t1,16
cr $t0,$t1
jl .Lgo
oill $t0,0x80 # set "decrypt" bit
st $t0,240($key)
br $ra
-
-.align 16
-.Ldkey_internal:
- stg $key,32($sp)
- stg $ra,40($sp)
- bras $ra,.Lekey_internal
- lg $key,32($sp)
- lg $ra,40($sp)
___
$code.=<<___;
-
-.Lgo: llgf $rounds,240($key)
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
la $i1,0($key)
sllg $i2,$rounds,4
la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ $code.=<<___;
la $key,4($key)
brct $rounds,.Lmix
- lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+ lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivec, const int enc)
{
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly);
l %r0,240($key) # load kmc code
lghi $key,15 # res=len%16, len-=res;
ngr $key,$len
- slgr $len,$key
+ sl${g}r $len,$key
la %r1,16($sp) # parameter block - ivec || key
jz .Lkmc_truncated
.long 0xb92f0042 # kmc %r4,%r2
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly);
tmll %r0,0x80
jnz .Lkmc_truncated_dec
lghi %r1,0
- stg %r1,128($sp)
- stg %r1,136($sp)
+ stg %r1,16*$SIZE_T($sp)
+ stg %r1,16*$SIZE_T+8($sp)
bras %r1,1f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
1: ex $key,0(%r1)
la %r1,16($sp) # restore parameter block
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
j .Lkmc_done
.align 16
.Lkmc_truncated_dec:
- stg $out,64($sp)
- la $out,128($sp)
+ st${g} $out,4*$SIZE_T($sp)
+ la $out,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
- lg $out,64($sp)
+ l${g} $out,4*$SIZE_T($sp)
bras %r1,2f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
2: ex $key,0(%r1)
j .Lkmc_done
.align 16
.Lcbc_software:
___
$code.=<<___;
- stmg $key,$ra,40($sp)
+ stm${g} $key,$ra,5*$SIZE_T($sp)
lhi %r0,0
- cl %r0,164($sp)
+ cl %r0,`$stdframe+$SIZE_T-4`($sp)
je .Lcbc_decrypt
larl $tbl,AES_Te
@@ -1219,10 +1264,10 @@ $code.=<<___;
llgf $s3,12($ivp)
lghi $t0,16
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
.Lcbc_enc_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
x $s0,0($inp)
x $s1,4($inp)
x $s2,8($inp)
@@ -1231,7 +1276,7 @@ $code.=<<___;
bras $ra,_s390x_AES_encrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
@@ -1240,33 +1285,33 @@ $code.=<<___;
la $inp,16($inp)
la $out,16($out)
lghi $t0,16
- ltgr $len,$len
+ lt${g}r $len,$len
jz .Lcbc_enc_done
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
j .Lcbc_enc_loop
.align 16
.Lcbc_enc_done:
- lg $ivp,48($sp)
+ l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
- lmg %r7,$ra,56($sp)
+ lm${g} %r7,$ra,7*$SIZE_T($sp)
br $ra
.align 16
.Lcbc_enc_tail:
aghi $len,15
lghi $t0,0
- stg $t0,128($sp)
- stg $t0,136($sp)
+ stg $t0,16*$SIZE_T($sp)
+ stg $t0,16*$SIZE_T+8($sp)
bras $t1,3f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
3: ex $len,0($t1)
lghi $len,0
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
j .Lcbc_enc_loop
.align 16
@@ -1275,10 +1320,10 @@ $code.=<<___;
lg $t0,0($ivp)
lg $t1,8($ivp)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
.Lcbc_dec_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
llgf $s2,8($inp)
@@ -1287,7 +1332,7 @@ $code.=<<___;
bras $ra,_s390x_AES_decrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
sllg $s0,$s0,32
sllg $s2,$s2,32
lr $s0,$s1
@@ -1295,15 +1340,15 @@ $code.=<<___;
lg $t0,0($inp)
lg $t1,8($inp)
- xg $s0,128($sp)
- xg $s2,136($sp)
+ xg $s0,16*$SIZE_T($sp)
+ xg $s2,16*$SIZE_T+8($sp)
lghi $s1,16
- slgr $len,$s1
+ sl${g}r $len,$s1
brc 4,.Lcbc_dec_tail # if borrow
brc 2,.Lcbc_dec_done # if zero
stg $s0,0($out)
stg $s2,8($out)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
la $inp,16($inp)
la $out,16($out)
@@ -1313,7 +1358,7 @@ $code.=<<___;
stg $s0,0($out)
stg $s2,8($out)
.Lcbc_dec_exit:
- lmg $ivp,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
stmg $t0,$t1,0($ivp)
br $ra
@@ -1321,19 +1366,872 @@ $code.=<<___;
.align 16
.Lcbc_dec_tail:
aghi $len,15
- stg $s0,128($sp)
- stg $s2,136($sp)
+ stg $s0,16*$SIZE_T($sp)
+ stg $s2,16*$SIZE_T+8($sp)
bras $s1,4f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
4: ex $len,0($s1)
j .Lcbc_dec_exit
.size AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm OPENSSL_s390xcap_P,8,8
+___
+}
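One detail of the truncated-tail paths above: MVC encodes its length as an immediate, so "bras %r1,1f" captures the address of a one-byte MVC template and "ex $key,0(%r1)" executes it with the length field patched from the residue at run time. For encryption the tail is first zero-padded to a full block on the stack and then run through one more 16-byte KMC pass. A semantic sketch; kmc_block() is a hypothetical stand-in for that final hardware call:

    # What the EXECUTE'd MVC achieves in .Lcbc_enc_tail: the len%16
    # trailing bytes land in a zeroed 16-byte stack buffer, which is
    # then processed as one more CBC block.
    sub cbc_enc_tail {
        my ($tail, $kmc_block) = @_;     # $tail: 1..15 trailing bytes
        my $buf = $tail . ("\0" x (16 - length $tail));  # zero-pad
        return $kmc_block->($buf);       # one more 16-byte CBC step
    }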
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4"; # blocks and out are swapped
+my $len="%r3";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+ llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+	# prepare and allocate stack frame at the top of a 4K page
+	# with 1K reserved for possible signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least a 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi needs the extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16 bytes
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ st${g} $s2,0($sp) # back-chain
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_switch # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
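The frame gymnastics above position the scratch buffer of counter blocks at a fixed offset within a 4KB page (the "stack frame alignment" caveat in the header comment comes from exactly this), keeping 1KB+16 bytes in reserve below the new stack pointer for signal handlers. A sketch of the pointer arithmetic, addresses as plain integers (illustrative only):

    # Mirrors the lghi/algr/ngr/slgr sequence above.
    sub ctr32_scratch {
        my ($sp) = @_;                   # incoming stack pointer
        my $base  = ($sp - 1024 - 256 - 16) & ~4095; # round down to page
        my $avail = ($sp - $base) - (1024 + 16);     # usable buffer size
        my $newsp = $base + 1024;        # alloca; data starts at 16(newsp)
        return ($newsp, $avail);         # $avail is at least 256 bytes
    }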
+$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
+ larl $s0,OPENSSL_s390xcap_P
+ lg $s0,8($s0)
+ tmhh $s0,0x0004 # check for message_security-assist-4
+ jz .Lctr32_km_loop
+
+ llgfr $s0,%r0
+ lgr $s1,%r1
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb92d2042 # kmctr %r4,%r2,%r2
+
+ llihh %r0,0x8000 # check if kmctr supports the function code
+ srlg %r0,%r0,0($s0)
+ ng %r0,16($sp)
+ lgr %r0,$s0
+ lgr %r1,$s1
+ jz .Lctr32_km_loop
+
+####### kmctr code
+ algr $out,$inp # restore $out
+ lgr $s1,$len # $s1 undertakes $len
+ j .Lctr32_kmctr_loop
+.align 16
+.Lctr32_kmctr_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_kmctr_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_kmctr_prepare
+
+ #la $inp,0($inp) # inp
+ sllg $len,$fp,4 # len
+ #la $out,0($out) # out
+ la $s2,16($sp) # iv
+ .long 0xb92da042 # kmctr $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ slgr $s1,$fp
+ brc 1,.Lctr32_kmctr_loop # not zero, no borrow
+ algr $fp,$s1
+ lghi $s1,0
+ brc 4+1,.Lctr32_kmctr_loop # not zero
+
+ l${g} $sp,0($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_km_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_km_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_km_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_km_loop # not zero
+
+ l${g} $s0,0($sp)
+ l${g} $s1,$SIZE_T($sp)
+ la $s2,16($sp)
+.Lctr32_km_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_km_zap
+
+ la $sp,0($s0)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ sl${g}r $inp,$out
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,16*$SIZE_T($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$ivp,2*$SIZE_T($sp)
+ llgf $t1,16*$SIZE_T($sp)
+ x $s0,0($inp,$out)
+ x $s1,4($inp,$out)
+ x $s2,8($inp,$out)
+ x $s3,12($inp,$out)
+ stm $s0,$s3,0($out)
+
+ la $out,16($out)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
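As the "32-bit increment, preserves upper half" comments stress, AES_ctr32_encrypt only ever bumps the last 32-bit word of the counter block: the upper 96 bits of the IV pass through untouched and the low word wraps modulo 2^32. A reference sketch of that schedule (illustrative only):

    # Counter update shared by the km, kmctr and software paths above:
    # only IV word 3 changes, wrapping mod 2^32 without carrying upward.
    sub next_ctr32 {
        my @iv = @_;                          # IV as four 32-bit words
        $iv[3] = ($iv[3] + 1) & 0xffffffff;   # the ahi-style increment
        return @iv;
    }

    my @blk = (1, 2, 3, 0xffffffff);
    @blk = next_ctr32(@blk);                  # -> (1, 2, 3, 0)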
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4"; # len and out are swapped
+my $len="%r3";
+my $key1="%r5"; # $i1
+my $key2="%r6"; # $i2
+my $fp="%r7"; # $i3
+my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type _s390x_xts_km,\@function
+.align 16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+ llgfr $s0,%r0 # put aside the function code
+ lghi $s1,0x7f
+ nr $s1,%r0
+ lghi %r0,0 # query capability vector
+ la %r1,$tweak-16($sp)
+ .long 0xb92e0042 # km %r4,%r2
+ llihh %r1,0x8000
+ srlg %r1,%r1,32($s1) # check for 32+function code
+ ng %r1,$tweak-16($sp)
+ lgr %r0,$s0 # restore the function code
+ la %r1,0($key1) # restore $key1
+ jz .Lxts_km_vanilla
+
+ lmg $i2,$i3,$tweak($sp) # put aside the tweak value
+ algr $out,$inp
+
+ oill %r0,32 # switch to xts function code
+ aghi $s1,-18 #
+ sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
+ la %r1,$tweak-16($sp)
+ slgr %r1,$s1 # parameter block position
+ lmg $s0,$s3,0($key1) # load 256 bits of key material,
+ stmg $s0,$s3,0(%r1) # and copy it to parameter block.
+ # yes, it contains junk and overlaps
+ # with the tweak in 128-bit case.
+ # it's done to avoid conditional
+ # branch.
+ stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
+
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ lrvg $s0,$tweak+0($sp) # load the last tweak
+ lrvg $s1,$tweak+8($sp)
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
+
+ nill %r0,0xffdf # switch back to original function code
+ la %r1,0($key1) # restore pointer to $key1
+ slgr $out,$inp
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ br $ra
+
+.align 16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+	# prepare and allocate stack frame at the top of a 4K page
+	# with 1K reserved for possible signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least a 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi needs the extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16 bytes
+
+ la $sp,1024($s0) # alloca
+ nill $fp,0xfff0 # round to 16*n
+ st${g} $s2,0($sp) # back-chain
+ nill $len,0xfff0 # redundant
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_go # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+ lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
+ lrvg $s1,$tweak+8($s2)
+
+ la $s2,16($sp) # vector of ascending tweak values
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+ j .Lxts_km_start
+
+.Lxts_km_loop:
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_prepare:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+.Lxts_km_start:
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ stg $i1,0($s2,$inp)
+ stg $i2,8($s2,$inp)
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_prepare
+
+ slgr $inp,$fp # rewind $inp
+ la $s2,0($out,$inp)
+ lgr $s3,$fp
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_xor:
+ lg $i1,0($out,$inp)
+ lg $i2,8($out,$inp)
+ xg $i1,0($s2,$inp)
+ xg $i2,8($s2,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lxts_km_loop # not zero
+
+ l${g} $i1,0($sp) # back-chain
+ llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
+ la $i2,16($sp)
+ srlg $fp,$fp,4
+.Lxts_km_zap:
+ stg $i1,0($i2)
+ stg $i1,8($i2)
+ la $i2,16($i2)
+ brct $fp,.Lxts_km_zap
+
+ la $sp,0($i1)
+	llgc	$len,`2*$SIZE_T-1`($i1)
+ nill $len,0x0f # $len%=16
+ bzr $ra
+
+ # generate one more tweak...
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+
+ ltr $len,$len # clear zero flag
+ br $ra
+.size _s390x_xts_km,.-_s390x_xts_km
+
+.globl AES_xts_encrypt
+.type AES_xts_encrypt,\@function
+.align 16
+AES_xts_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
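Both _s390x_xts_km (the lghi 0x87 / srag / ngr / algr / alcgr / xgr run) and the software loop below advance the XTS tweak the same way: the byte-reversed tweak is treated as a 128-bit little-endian integer, doubled, and any bit that escapes the top is folded back in as the GF(2^128) reduction constant 0x87. A sketch with the tweak as two 64-bit limbs; assumes a Perl built with 64-bit integers (illustrative only):

    # Tweak *= x in GF(2^128); lo/hi are little-endian 64-bit limbs.
    sub xts_double_tweak {
        my ($lo, $hi) = @_;
        my $rem   = ($hi >> 63) ? 0x87 : 0;  # srag/ngr: broadcast top bit
        my $carry = ($lo >> 63) & 1;         # bit crossing between limbs
        $lo = (($lo << 1) & 0xffffffffffffffff) ^ $rem;    # algr + xgr
        $hi = (($hi << 1) & 0xffffffffffffffff) | $carry;  # alcgr
        return ($lo, $hi);
    }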
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+	srag	$len,$len,4		# formally wrong, because it sign-
+					# extends, but who can afford asking
+ # to process more than 2^63-1 bytes?
+ # I use it, because it sets condition
+ # code...
+ bcr 8,$ra # abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_enc_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ sllg $len,$len,4 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed anymore
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+ bras $ra,_s390x_xts_km
+ jz .Lxts_enc_km_done
+
+ aghi $inp,-16 # take one step back
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_enc_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_km_steal
+
+ la $s2,0($i3)
+ lghi $s3,16
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($s2)
+ xg $i2,8($s2)
+ stg $i1,0($s2)
+ stg $i2,8($s2)
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($i3)
+ xg $i2,8($i3)
+ stg $i1,0($i3)
+ stg $i2,8($i3)
+
+.Lxts_enc_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_enc_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ j .Lxts_enc_enter
+
+.align 16
+.Lxts_enc_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+ la $inp,16($inp) # $inp+=16
+.Lxts_enc_enter:
+ x $s0,0($inp) # ^=*($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ brct${g} $len,.Lxts_enc_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_enc_done
+
+ la $i3,0($inp,$out) # put aside real $out
+.Lxts_enc_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_steal
+ la $out,0($i3) # restore real $out
+
+ # generate last tweak...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+
+	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,`$tweak+0`($sp) # ^=tweak
+ x $s1,`$tweak+4`($sp)
+ x $s2,`$tweak+8`($sp)
+ x $s3,`$tweak+12`($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+.Lxts_enc_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+	stg	$sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+$code.=<<___;
+.globl AES_xts_decrypt
+.type AES_xts_decrypt,\@function
+.align 16
+AES_xts_decrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ aghi $len,-16
+	bcr	4,$ra			# abort if less than zero; formally
+					# wrong, because $len is unsigned,
+					# but who can afford to process
+					# more than 2^63-1 bytes?
+ tmll $len,0x0f
+ jnz .Lxts_dec_proceed
+ aghi $len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_dec_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ nill $len,0xfff0 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed past this point
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+
+ ltgr $len,$len
+ jz .Lxts_dec_km_short
+ bras $ra,_s390x_xts_km
+ jz .Lxts_dec_km_done
+
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+ j .Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ lrvg $s0,$tweak+0($sp) # load the tweak
+ lrvg $s1,$tweak+8($sp)
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+
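+	# XTS ciphertext stealing on decrypt runs the tweaks out of order:
+	# the last full block is processed with the *next* tweak, and the
+	# stolen partial block afterwards with the current one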
+.Lxts_dec_km_2ndtweak:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $i2,0($out,$inp)
+ lghi $i3,16
+ .long 0xb92e0066 # km $i2,$i2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0
+ lrvgr $i2,$s1
+ xg $i1,0($out,$inp)
+ xg $i2,8($out,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_km_steal
+
+ lgr $s0,$s2
+ lgr $s1,$s3
+ xg $s0,0($i3)
+ xg $s1,8($i3)
+ stg $s0,0($i3)
+ stg $s1,8($i3)
+ la $s0,0($i3)
+ lghi $s1,16
+ .long 0xb92e0088 # km $s0,$s0
+ brc 1,.-4 # can this happen?
+ xg $s2,0($i3)
+ xg $s3,8($i3)
+ stg $s2,0($i3)
+ stg $s3,8($i3)
+.Lxts_dec_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_dec_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ srlg $len,$len,4
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ larl $tbl,AES_Td
+ lt${g}r $len,$len
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ jz .Lxts_dec_short
+ j .Lxts_dec_enter
+
+.align 16
+.Lxts_dec_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+.Lxts_dec_enter:
+ x $s0,0($inp) # tweak^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ la $inp,16($inp)
+ brct${g} $len,.Lxts_dec_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_dec_done
+
+ # generate pair of tweaks...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $i2,$s1 # flip byte order
+ lrvgr $i3,$s3
+ stmg $i2,$i3,$tweak($sp) # save the 1st tweak
+ j .Lxts_dec_2ndtweak
+
+.align 16
+.Lxts_dec_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak-16+0($sp) # save the 2nd tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak-16+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($inp) # tweak_the_2nd^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
+ x $s1,$tweak-16+4($sp)
+ x $s2,$tweak-16+8($sp)
+ x $s3,$tweak-16+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_steal
+ la $out,0($i3) # restore real $out
+
+ lm $s0,$s3,$tweak($sp) # load the 1st tweak
+ x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+ stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
+ stg $sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+	stg	$sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_decrypt,.-AES_xts_decrypt
___
}
$code.=<<___;
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
+close STDOUT; # force flush
diff --git a/main/openssl/crypto/aes/asm/aes-sparcv9.pl b/main/openssl/crypto/aes/asm/aes-sparcv9.pl
index c57b3a2d..403c4d12 100755
--- a/main/openssl/crypto/aes/asm/aes-sparcv9.pl
+++ b/main/openssl/crypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ___
# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
# an undesired effect, so just omit them and sacrifice a fraction of a
# percent in performance...
-$code =~ s/fmovs.*$//gem;
+$code =~ s/fmovs.*$//gm;
print $code;
+close STDOUT; # ensure flush
diff --git a/main/openssl/crypto/aes/asm/aes-x86_64.S b/main/openssl/crypto/aes/asm/aes-x86_64.S
new file mode 100644
index 00000000..e385566f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-x86_64.S
@@ -0,0 +1,2541 @@
+.text
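+# _x86_64_AES_encrypt: table-driven encryption of one 16-byte block held
+# as four 32-bit words in %eax,%ebx,%ecx,%edx; %r15 walks the key
+# schedule, %r14 points at the Te tables; %rsi,%rdi,%rbp,%r8,%r10-%r13
+# are clobbered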
+.type _x86_64_AES_encrypt,@function
+.align 16
+_x86_64_AES_encrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Lenc_loop
+.align 16
+.Lenc_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ shrl $16,%ecx
+ movzbl %ah,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movl 12(%r15),%edx
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rdi,8),%r12d
+ xorl 1(%r14,%rbp,8),%r8d
+
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Lenc_loop
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl 2(%r14,%rsi,8),%r10d
+ movzbl 2(%r14,%rdi,8),%r11d
+ movzbl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl 2(%r14,%rsi,8),%r8d
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $65280,%edi
+ andl $65280,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%ecx
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+
+ andl $65280,%esi
+ andl $65280,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $16711680,%edi
+ andl $16711680,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 2(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $4278190080,%edi
+ andl $4278190080,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movl 16+12(%r15),%edx
+ movl 2(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 16+0(%r15),%eax
+
+ andl $4278190080,%esi
+ andl $4278190080,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
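+	# 0xf3,0xc3 is "rep ret", a benign prefix that avoids the AMD
+	# branch-predictor penalty for a single-byte ret at a branch target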
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
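+# the _compact variant trades speed for cache footprint: it uses only
+# the 256-byte Te4 S-box and computes MixColumns arithmetically, which
+# also reduces exposure to cache-timing attacks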
+.type _x86_64_AES_encrypt_compact,@function
+.align 16
+_x86_64_AES_encrypt_compact:
+ leaq 128(%r14),%r8
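+	# touch the 256-byte S-box at 32-byte stride to pull it into
+	# cache (prefetch Te4); the loaded values are discarded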
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %dh,%ebp
+ movzbl %ah,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %dh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ah,%edi
+ shrl $8,%ecx
+ shrl $8,%ebx
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rcx,1),%edx
+ movzbl (%r14,%rbx,1),%ecx
+ shll $16,%r9d
+ shll $16,%r13d
+ shll $16,%ebp
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%edi
+ shll $24,%edx
+ xorl %esi,%r10d
+ shll $24,%ecx
+ xorl %edi,%r11d
+ movl %r10d,%eax
+ movl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Lenc_compact_done
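+	# MixColumns on the fly: the masks 0x80808080/0xfefefefe/0x1b1b1b1b
+	# implement a branchless xtime (GF(2^8) multiply-by-2) on the four
+	# bytes of each state word in parallel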
+ movl %eax,%esi
+ movl %ebx,%edi
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ movl %esi,%r10d
+ movl %edi,%r11d
+ shrl $7,%r10d
+ leal (%rax,%rax,1),%r8d
+ shrl $7,%r11d
+ leal (%rbx,%rbx,1),%r9d
+ subl %r10d,%esi
+ subl %r11d,%edi
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %eax,%r10d
+ movl %ebx,%r11d
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %ecx,%esi
+ movl %edx,%edi
+ roll $24,%eax
+ roll $24,%ebx
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %esi,%r12d
+ movl %edi,%ebp
+ rorl $16,%r10d
+ rorl $16,%r11d
+ shrl $7,%r12d
+ leal (%rcx,%rcx,1),%r8d
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ shrl $7,%ebp
+ leal (%rdx,%rdx,1),%r9d
+ rorl $8,%r10d
+ rorl $8,%r11d
+ subl %r12d,%esi
+ subl %ebp,%edi
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %ecx,%r12d
+ movl %edx,%ebp
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ roll $24,%ecx
+ roll $24,%edx
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ movl 0(%r14),%esi
+ rorl $16,%r12d
+ rorl $16,%ebp
+ movl 64(%r14),%edi
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ movl 128(%r14),%r8d
+ rorl $8,%r12d
+ rorl $8,%ebp
+ movl 192(%r14),%r9d
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Lenc_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
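+	# .LAES_Te+2048 holds four aligned copies of the 256-byte Te4 table;
+	# pick one of them based on the stack address so that table lookups
+	# and the stack frame do not contend for the same cache sets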
+ leaq .LAES_Te+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lenc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_encrypt,.-AES_encrypt
+.type _x86_64_AES_decrypt,@function
+.align 16
+_x86_64_AES_decrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Ldec_loop
+.align 16
+.Ldec_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl %ch,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ movl 12(%r15),%edx
+ movzbl %ah,%ebp
+ xorl 1(%r14,%rsi,8),%r12d
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rbp,8),%r8d
+
+ xorl %r10d,%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r12d,%ecx
+ xorl %r11d,%ebx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Ldec_loop
+ leaq 2048(%r14),%r14
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl (%r14,%rsi,1),%r10d
+ movzbl (%r14,%rdi,1),%r11d
+ movzbl (%r14,%rbp,1),%r12d
+
+ movzbl %dl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movzbl (%r14,%rsi,1),%r8d
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $8,%edi
+ shll $8,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%edx
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%eax
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+
+ shll $8,%esi
+ shll $8,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $16,%edi
+ shll $16,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $24,%edi
+ shll $24,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movl 16+12(%r15),%edx
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movl 16+0(%r15),%eax
+
+ shll $24,%esi
+ shll $24,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ leaq -2048(%r14),%r14
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+.type _x86_64_AES_decrypt_compact,@function
+.align 16
+_x86_64_AES_decrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Ldec_loop_compact
+
+.align 16
+.Ldec_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %bh,%ebp
+ movzbl %ch,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %bh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ch,%edi
+ shll $16,%r9d
+ shll $16,%r13d
+ movzbl (%r14,%rdi,1),%ebx
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+
+ movzbl %dh,%edi
+ shrl $8,%eax
+ shll $16,%ebp
+ movzbl (%r14,%rdi,1),%ecx
+ movzbl (%r14,%rax,1),%edx
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%ebx
+ shll $24,%ecx
+ xorl %esi,%r10d
+ shll $24,%edx
+ xorl %r11d,%ebx
+ movl %r10d,%eax
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Ldec_compact_done
+
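+	# InvMixColumns on the fly: three chained 64-bit xtime steps, using
+	# the 0x80..80/0xfe..fe/0x1b..1b masks stored past the Td4 table,
+	# build the GF(2^8) multiplications by 9, 11, 13 and 14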
+ movq 256+0(%r14),%rsi
+ shlq $32,%rbx
+ shlq $32,%rdx
+ movq 256+8(%r14),%rdi
+ orq %rbx,%rax
+ orq %rdx,%rcx
+ movq 256+16(%r14),%rbp
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq 0(%r14),%rsi
+ shrq $32,%r8
+ shrq $32,%r11
+ movq 64(%r14),%rdi
+ roll $16,%r9d
+ roll $16,%r12d
+ movq 128(%r14),%rbp
+ roll $16,%r8d
+ roll $16,%r11d
+ movq 192(%r14),%r10
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+ movq 256(%r14),%r13
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ jmp .Ldec_loop_compact
+.align 16
+.Ldec_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Ldec_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Td+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+ shrq $3,%rbp
+ addq %rbp,%r14
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Ldec_epilogue:
+ .byte 0xf3,0xc3
+.size AES_decrypt,.-AES_decrypt
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $8,%rsp
+.Lenc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Lenc_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+
+.type _x86_64_AES_set_encrypt_key,@function
+.align 16
+_x86_64_AES_set_encrypt_key:
+ movl %esi,%ecx
+ movq %rdi,%rsi
+ movq %rdx,%rdi
+
+ testq $-1,%rsi
+ jz .Lbadpointer
+ testq $-1,%rdi
+ jz .Lbadpointer
+
+ leaq .LAES_Te(%rip),%rbp
+ leaq 2048+128(%rbp),%rbp
+
+
+ movl 0-128(%rbp),%eax
+ movl 32-128(%rbp),%ebx
+ movl 64-128(%rbp),%r8d
+ movl 96-128(%rbp),%edx
+ movl 128-128(%rbp),%eax
+ movl 160-128(%rbp),%ebx
+ movl 192-128(%rbp),%r8d
+ movl 224-128(%rbp),%edx
+
+ cmpl $128,%ecx
+ je .L10rounds
+ cmpl $192,%ecx
+ je .L12rounds
+ cmpl $256,%ecx
+ je .L14rounds
+ movq $-2,%rax
+ jmp .Lexit
+
+.L10rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rdx,8(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L10shortcut
+.align 4
+.L10loop:
+ movl 0(%rdi),%eax
+ movl 12(%rdi),%edx
+.L10shortcut:
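+	# one key-expansion step: SubWord(RotWord(prev)) via four S-box
+	# lookups at -128(%rbp), then xor with rcon[i] fetched from
+	# 1024-128(%rbp,%rcx,4)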
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,16(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,20(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,24(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,28(%rdi)
+ addl $1,%ecx
+ leaq 16(%rdi),%rdi
+ cmpl $10,%ecx
+ jl .L10loop
+
+ movl $10,80(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L12rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rdx,16(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L12shortcut
+.align 4
+.L12loop:
+ movl 0(%rdi),%eax
+ movl 20(%rdi),%edx
+.L12shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,24(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,28(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,32(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,36(%rdi)
+
+ cmpl $7,%ecx
+ je .L12break
+ addl $1,%ecx
+
+ xorl 16(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ leaq 24(%rdi),%rdi
+ jmp .L12loop
+.L12break:
+ movl $12,72(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L14rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L14shortcut
+.align 4
+.L14loop:
+ movl 0(%rdi),%eax
+ movl 28(%rdi),%edx
+.L14shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,32(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,36(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ cmpl $6,%ecx
+ je .L14break
+ addl $1,%ecx
+
+ movl %eax,%edx
+ movl 16(%rdi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movl %eax,48(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,52(%rdi)
+ xorl 24(%rdi),%eax
+ movl %eax,56(%rdi)
+ xorl 28(%rdi),%eax
+ movl %eax,60(%rdi)
+
+ leaq 32(%rdi),%rdi
+ jmp .L14loop
+.L14break:
+ movl $14,48(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.Lbadpointer:
+ movq $-1,%rax
+.Lexit:
+.byte 0xf3,0xc3
+.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdx
+.Ldec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ movq (%rsp),%r8
+ cmpl $0,%eax
+ jne .Labort
+
+ movl 240(%r8),%r14d
+ xorq %rdi,%rdi
+ leaq (%rdi,%r14,4),%rcx
+ movq %r8,%rsi
+ leaq (%r8,%rcx,4),%rdi
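+	# build the decryption schedule from the encryption one: first
+	# reverse the round-key array in place (swapping from both ends);
+	# .Lpermute below then applies InvMixColumns to every inner round key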
+.align 4
+.Linvert:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 0(%rdi),%rcx
+ movq 8(%rdi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,0(%rsi)
+ movq %rdx,8(%rsi)
+ leaq 16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+ cmpq %rsi,%rdi
+ jne .Linvert
+
+ leaq .LAES_Te+2048+1024(%rip),%rax
+
+ movq 40(%rax),%rsi
+ movq 48(%rax),%rdi
+ movq 56(%rax),%rbp
+
+ movq %r8,%r15
+ subl $1,%r14d
+.align 4
+.Lpermute:
+ leaq 16(%r15),%r15
+ movq 0(%r15),%rax
+ movq 8(%r15),%rcx
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+
+ shrq $32,%r8
+ shrq $32,%r11
+
+ roll $16,%r9d
+ roll $16,%r12d
+
+ roll $16,%r8d
+ roll $16,%r11d
+
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ subl $1,%r14d
+ jnz .Lpermute
+
+ xorq %rax,%rax
+.Labort:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Ldec_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+ cmpq $0,%rdx
+ je .Lcbc_epilogue
+ pushfq
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.Lcbc_prologue:
+
+ cld
+ movl %r9d,%r9d
+
+ leaq .LAES_Te(%rip),%r14
+ cmpq $0,%r9
+ jne .Lcbc_picked_te
+ leaq .LAES_Td(%rip),%r14
+.Lcbc_picked_te:
+
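+	# take the fast Te-table path only for >=512-byte, 16-byte-multiple
+	# inputs on CPUs without hyper-threading (ia32cap bit 28), where a
+	# cache-sharing sibling thread could observe table accesses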
+ movl OPENSSL_ia32cap_P(%rip),%r10d
+ cmpq $512,%rdx
+ jb .Lcbc_slow_prologue
+ testq $15,%rdx
+ jnz .Lcbc_slow_prologue
+ btl $28,%r10d
+ jc .Lcbc_slow_prologue
+
+
+ leaq -88-248(%rsp),%r15
+ andq $-64,%r15
+
+
+ movq %r14,%r10
+ leaq 2304(%r14),%r11
+ movq %r15,%r12
+ andq $4095,%r10
+ andq $4095,%r11
+ andq $4095,%r12
+
+ cmpq %r11,%r12
+ jb .Lcbc_te_break_out
+ subq %r11,%r12
+ subq %r12,%r15
+ jmp .Lcbc_te_ok
+.Lcbc_te_break_out:
+ subq %r10,%r12
+ andq $4095,%r12
+ addq $320,%r12
+ subq %r12,%r15
+.align 4
+.Lcbc_te_ok:
+
+ xchgq %rsp,%r15
+
+ movq %r15,16(%rsp)
+.Lcbc_fast_body:
+ movq %rdi,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rdx,40(%rsp)
+ movq %rcx,48(%rsp)
+ movq %r8,56(%rsp)
+ movl $0,80+240(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+
+ movl 240(%r15),%eax
+
+ movq %r15,%r10
+ subq %r14,%r10
+ andq $4095,%r10
+ cmpq $2304,%r10
+ jb .Lcbc_do_ecopy
+ cmpq $4096-248,%r10
+ jb .Lcbc_skip_ecopy
+.align 4
+.Lcbc_do_ecopy:
+ movq %r15,%rsi
+ leaq 80(%rsp),%rdi
+ leaq 80(%rsp),%r15
+ movl $30,%ecx
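+	# 0x90A548F3 = "rep movsq; nop": copy the 240-byte key schedule
+	# (30 quadwords) into the on-stack buffer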
+.long 0x90A548F3
+ movl %eax,(%rdi)
+.Lcbc_skip_ecopy:
+ movq %r15,0(%rsp)
+
+ movl $18,%ecx
+.align 4
+.Lcbc_prefetch_te:
+ movq 0(%r14),%r10
+ movq 32(%r14),%r11
+ movq 64(%r14),%r12
+ movq 96(%r14),%r13
+ leaq 128(%r14),%r14
+ subl $1,%ecx
+ jnz .Lcbc_prefetch_te
+ leaq -2304(%r14),%r14
+
+ cmpq $0,%rbx
+ je .LFAST_DECRYPT
+
+
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+.align 4
+.Lcbc_fast_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_encrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ movq %r10,40(%rsp)
+ jnz .Lcbc_fast_enc_loop
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_fast_cleanup
+
+
+.align 16
+.LFAST_DECRYPT:
+ cmpq %r8,%r9
+ je .Lcbc_fast_dec_in_place
+
+ movq %rbp,64(%rsp)
+.align 4
+.Lcbc_fast_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 64(%rsp),%rbp
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0(%rbp),%eax
+ xorl 4(%rbp),%ebx
+ xorl 8(%rbp),%ecx
+ xorl 12(%rbp),%edx
+ movq %r8,%rbp
+
+ subq $16,%r10
+ movq %r10,40(%rsp)
+ movq %rbp,64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jnz .Lcbc_fast_dec_loop
+ movq 56(%rsp),%r12
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0(%r12)
+ movq %r11,8(%r12)
+ jmp .Lcbc_fast_cleanup
+
+.align 16
+.Lcbc_fast_dec_in_place:
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0+64(%rsp)
+ movq %r11,8+64(%rsp)
+.align 4
+.Lcbc_fast_dec_in_place_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jz .Lcbc_fast_dec_in_place_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ movq %r10,40(%rsp)
+ jmp .Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+.align 4
+.Lcbc_fast_cleanup:
+ cmpl $0,80+240(%rsp)
+ leaq 80(%rsp),%rdi
+ je .Lcbc_exit
+ movl $30,%ecx
+ xorq %rax,%rax
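+	# 0x90AB48F3 = "rep stosq; nop": zero the 240-byte on-stack key copy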
+.long 0x90AB48F3
+
+ jmp .Lcbc_exit
+
+
+.align 16
+.Lcbc_slow_prologue:
+
+ leaq -88(%rsp),%rbp
+ andq $-64,%rbp
+
+ leaq -88-63(%rcx),%r10
+ subq %rbp,%r10
+ negq %r10
+ andq $960,%r10
+ subq %r10,%rbp
+
+ xchgq %rsp,%rbp
+
+ movq %rbp,16(%rsp)
+.Lcbc_slow_body:
+
+
+
+
+ movq %r8,56(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+ movq %rdx,%r10
+
+ movl 240(%r15),%eax
+ movq %r15,0(%rsp)
+ shll $4,%eax
+ leaq (%r15,%rax,1),%rax
+ movq %rax,8(%rsp)
+
+
+ leaq 2048(%r14),%r14
+ leaq 768-8(%rsp),%rax
+ subq %r14,%rax
+ andq $768,%rax
+ leaq (%r14,%rax,1),%r14
+
+ cmpq $0,%rbx
+ je .LSLOW_DECRYPT
+
+
+ testq $-16,%r10
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+ jz .Lcbc_slow_enc_tail
+
+.align 4
+.Lcbc_slow_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ jnz .Lcbc_slow_enc_loop
+ testq $15,%r10
+ jnz .Lcbc_slow_enc_tail
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_enc_tail:
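+	# final partial block: "rep movsb" (0x9066A4F3) copies the tail into
+	# the output buffer, "rep stosb" (0x9066AAF3) zero-pads it to 16
+	# bytes, and the padded block is then encrypted in place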
+ movq %rax,%r11
+ movq %rcx,%r12
+ movq %r10,%rcx
+ movq %r8,%rsi
+ movq %r9,%rdi
+.long 0x9066A4F3
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorq %rax,%rax
+.long 0x9066AAF3
+ movq %r9,%r8
+ movq $16,%r10
+ movq %r11,%rax
+ movq %r12,%rcx
+ jmp .Lcbc_slow_enc_loop
+
+.align 16
+.LSLOW_DECRYPT:
+ shrq $3,%rax
+ addq %rax,%r14
+
+ movq 0(%rbp),%r11
+ movq 8(%rbp),%r12
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+.align 4
+.Lcbc_slow_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jc .Lcbc_slow_dec_partial
+ jz .Lcbc_slow_dec_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jmp .Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_dec_partial:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0+64(%rsp)
+ movl %ebx,4+64(%rsp)
+ movl %ecx,8+64(%rsp)
+ movl %edx,12+64(%rsp)
+
+ movq %r9,%rdi
+ leaq 64(%rsp),%rsi
+ leaq 16(%r10),%rcx
+.long 0x9066A4F3
+ jmp .Lcbc_exit
+
+.align 16
+.Lcbc_exit:
+ movq 16(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lcbc_popfq:
+ popfq
+.Lcbc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+.align 64
+.LAES_Te:
+.long 0xa56363c6,0xa56363c6
+.long 0x847c7cf8,0x847c7cf8
+.long 0x997777ee,0x997777ee
+.long 0x8d7b7bf6,0x8d7b7bf6
+.long 0x0df2f2ff,0x0df2f2ff
+.long 0xbd6b6bd6,0xbd6b6bd6
+.long 0xb16f6fde,0xb16f6fde
+.long 0x54c5c591,0x54c5c591
+.long 0x50303060,0x50303060
+.long 0x03010102,0x03010102
+.long 0xa96767ce,0xa96767ce
+.long 0x7d2b2b56,0x7d2b2b56
+.long 0x19fefee7,0x19fefee7
+.long 0x62d7d7b5,0x62d7d7b5
+.long 0xe6abab4d,0xe6abab4d
+.long 0x9a7676ec,0x9a7676ec
+.long 0x45caca8f,0x45caca8f
+.long 0x9d82821f,0x9d82821f
+.long 0x40c9c989,0x40c9c989
+.long 0x877d7dfa,0x877d7dfa
+.long 0x15fafaef,0x15fafaef
+.long 0xeb5959b2,0xeb5959b2
+.long 0xc947478e,0xc947478e
+.long 0x0bf0f0fb,0x0bf0f0fb
+.long 0xecadad41,0xecadad41
+.long 0x67d4d4b3,0x67d4d4b3
+.long 0xfda2a25f,0xfda2a25f
+.long 0xeaafaf45,0xeaafaf45
+.long 0xbf9c9c23,0xbf9c9c23
+.long 0xf7a4a453,0xf7a4a453
+.long 0x967272e4,0x967272e4
+.long 0x5bc0c09b,0x5bc0c09b
+.long 0xc2b7b775,0xc2b7b775
+.long 0x1cfdfde1,0x1cfdfde1
+.long 0xae93933d,0xae93933d
+.long 0x6a26264c,0x6a26264c
+.long 0x5a36366c,0x5a36366c
+.long 0x413f3f7e,0x413f3f7e
+.long 0x02f7f7f5,0x02f7f7f5
+.long 0x4fcccc83,0x4fcccc83
+.long 0x5c343468,0x5c343468
+.long 0xf4a5a551,0xf4a5a551
+.long 0x34e5e5d1,0x34e5e5d1
+.long 0x08f1f1f9,0x08f1f1f9
+.long 0x937171e2,0x937171e2
+.long 0x73d8d8ab,0x73d8d8ab
+.long 0x53313162,0x53313162
+.long 0x3f15152a,0x3f15152a
+.long 0x0c040408,0x0c040408
+.long 0x52c7c795,0x52c7c795
+.long 0x65232346,0x65232346
+.long 0x5ec3c39d,0x5ec3c39d
+.long 0x28181830,0x28181830
+.long 0xa1969637,0xa1969637
+.long 0x0f05050a,0x0f05050a
+.long 0xb59a9a2f,0xb59a9a2f
+.long 0x0907070e,0x0907070e
+.long 0x36121224,0x36121224
+.long 0x9b80801b,0x9b80801b
+.long 0x3de2e2df,0x3de2e2df
+.long 0x26ebebcd,0x26ebebcd
+.long 0x6927274e,0x6927274e
+.long 0xcdb2b27f,0xcdb2b27f
+.long 0x9f7575ea,0x9f7575ea
+.long 0x1b090912,0x1b090912
+.long 0x9e83831d,0x9e83831d
+.long 0x742c2c58,0x742c2c58
+.long 0x2e1a1a34,0x2e1a1a34
+.long 0x2d1b1b36,0x2d1b1b36
+.long 0xb26e6edc,0xb26e6edc
+.long 0xee5a5ab4,0xee5a5ab4
+.long 0xfba0a05b,0xfba0a05b
+.long 0xf65252a4,0xf65252a4
+.long 0x4d3b3b76,0x4d3b3b76
+.long 0x61d6d6b7,0x61d6d6b7
+.long 0xceb3b37d,0xceb3b37d
+.long 0x7b292952,0x7b292952
+.long 0x3ee3e3dd,0x3ee3e3dd
+.long 0x712f2f5e,0x712f2f5e
+.long 0x97848413,0x97848413
+.long 0xf55353a6,0xf55353a6
+.long 0x68d1d1b9,0x68d1d1b9
+.long 0x00000000,0x00000000
+.long 0x2cededc1,0x2cededc1
+.long 0x60202040,0x60202040
+.long 0x1ffcfce3,0x1ffcfce3
+.long 0xc8b1b179,0xc8b1b179
+.long 0xed5b5bb6,0xed5b5bb6
+.long 0xbe6a6ad4,0xbe6a6ad4
+.long 0x46cbcb8d,0x46cbcb8d
+.long 0xd9bebe67,0xd9bebe67
+.long 0x4b393972,0x4b393972
+.long 0xde4a4a94,0xde4a4a94
+.long 0xd44c4c98,0xd44c4c98
+.long 0xe85858b0,0xe85858b0
+.long 0x4acfcf85,0x4acfcf85
+.long 0x6bd0d0bb,0x6bd0d0bb
+.long 0x2aefefc5,0x2aefefc5
+.long 0xe5aaaa4f,0xe5aaaa4f
+.long 0x16fbfbed,0x16fbfbed
+.long 0xc5434386,0xc5434386
+.long 0xd74d4d9a,0xd74d4d9a
+.long 0x55333366,0x55333366
+.long 0x94858511,0x94858511
+.long 0xcf45458a,0xcf45458a
+.long 0x10f9f9e9,0x10f9f9e9
+.long 0x06020204,0x06020204
+.long 0x817f7ffe,0x817f7ffe
+.long 0xf05050a0,0xf05050a0
+.long 0x443c3c78,0x443c3c78
+.long 0xba9f9f25,0xba9f9f25
+.long 0xe3a8a84b,0xe3a8a84b
+.long 0xf35151a2,0xf35151a2
+.long 0xfea3a35d,0xfea3a35d
+.long 0xc0404080,0xc0404080
+.long 0x8a8f8f05,0x8a8f8f05
+.long 0xad92923f,0xad92923f
+.long 0xbc9d9d21,0xbc9d9d21
+.long 0x48383870,0x48383870
+.long 0x04f5f5f1,0x04f5f5f1
+.long 0xdfbcbc63,0xdfbcbc63
+.long 0xc1b6b677,0xc1b6b677
+.long 0x75dadaaf,0x75dadaaf
+.long 0x63212142,0x63212142
+.long 0x30101020,0x30101020
+.long 0x1affffe5,0x1affffe5
+.long 0x0ef3f3fd,0x0ef3f3fd
+.long 0x6dd2d2bf,0x6dd2d2bf
+.long 0x4ccdcd81,0x4ccdcd81
+.long 0x140c0c18,0x140c0c18
+.long 0x35131326,0x35131326
+.long 0x2fececc3,0x2fececc3
+.long 0xe15f5fbe,0xe15f5fbe
+.long 0xa2979735,0xa2979735
+.long 0xcc444488,0xcc444488
+.long 0x3917172e,0x3917172e
+.long 0x57c4c493,0x57c4c493
+.long 0xf2a7a755,0xf2a7a755
+.long 0x827e7efc,0x827e7efc
+.long 0x473d3d7a,0x473d3d7a
+.long 0xac6464c8,0xac6464c8
+.long 0xe75d5dba,0xe75d5dba
+.long 0x2b191932,0x2b191932
+.long 0x957373e6,0x957373e6
+.long 0xa06060c0,0xa06060c0
+.long 0x98818119,0x98818119
+.long 0xd14f4f9e,0xd14f4f9e
+.long 0x7fdcdca3,0x7fdcdca3
+.long 0x66222244,0x66222244
+.long 0x7e2a2a54,0x7e2a2a54
+.long 0xab90903b,0xab90903b
+.long 0x8388880b,0x8388880b
+.long 0xca46468c,0xca46468c
+.long 0x29eeeec7,0x29eeeec7
+.long 0xd3b8b86b,0xd3b8b86b
+.long 0x3c141428,0x3c141428
+.long 0x79dedea7,0x79dedea7
+.long 0xe25e5ebc,0xe25e5ebc
+.long 0x1d0b0b16,0x1d0b0b16
+.long 0x76dbdbad,0x76dbdbad
+.long 0x3be0e0db,0x3be0e0db
+.long 0x56323264,0x56323264
+.long 0x4e3a3a74,0x4e3a3a74
+.long 0x1e0a0a14,0x1e0a0a14
+.long 0xdb494992,0xdb494992
+.long 0x0a06060c,0x0a06060c
+.long 0x6c242448,0x6c242448
+.long 0xe45c5cb8,0xe45c5cb8
+.long 0x5dc2c29f,0x5dc2c29f
+.long 0x6ed3d3bd,0x6ed3d3bd
+.long 0xefacac43,0xefacac43
+.long 0xa66262c4,0xa66262c4
+.long 0xa8919139,0xa8919139
+.long 0xa4959531,0xa4959531
+.long 0x37e4e4d3,0x37e4e4d3
+.long 0x8b7979f2,0x8b7979f2
+.long 0x32e7e7d5,0x32e7e7d5
+.long 0x43c8c88b,0x43c8c88b
+.long 0x5937376e,0x5937376e
+.long 0xb76d6dda,0xb76d6dda
+.long 0x8c8d8d01,0x8c8d8d01
+.long 0x64d5d5b1,0x64d5d5b1
+.long 0xd24e4e9c,0xd24e4e9c
+.long 0xe0a9a949,0xe0a9a949
+.long 0xb46c6cd8,0xb46c6cd8
+.long 0xfa5656ac,0xfa5656ac
+.long 0x07f4f4f3,0x07f4f4f3
+.long 0x25eaeacf,0x25eaeacf
+.long 0xaf6565ca,0xaf6565ca
+.long 0x8e7a7af4,0x8e7a7af4
+.long 0xe9aeae47,0xe9aeae47
+.long 0x18080810,0x18080810
+.long 0xd5baba6f,0xd5baba6f
+.long 0x887878f0,0x887878f0
+.long 0x6f25254a,0x6f25254a
+.long 0x722e2e5c,0x722e2e5c
+.long 0x241c1c38,0x241c1c38
+.long 0xf1a6a657,0xf1a6a657
+.long 0xc7b4b473,0xc7b4b473
+.long 0x51c6c697,0x51c6c697
+.long 0x23e8e8cb,0x23e8e8cb
+.long 0x7cdddda1,0x7cdddda1
+.long 0x9c7474e8,0x9c7474e8
+.long 0x211f1f3e,0x211f1f3e
+.long 0xdd4b4b96,0xdd4b4b96
+.long 0xdcbdbd61,0xdcbdbd61
+.long 0x868b8b0d,0x868b8b0d
+.long 0x858a8a0f,0x858a8a0f
+.long 0x907070e0,0x907070e0
+.long 0x423e3e7c,0x423e3e7c
+.long 0xc4b5b571,0xc4b5b571
+.long 0xaa6666cc,0xaa6666cc
+.long 0xd8484890,0xd8484890
+.long 0x05030306,0x05030306
+.long 0x01f6f6f7,0x01f6f6f7
+.long 0x120e0e1c,0x120e0e1c
+.long 0xa36161c2,0xa36161c2
+.long 0x5f35356a,0x5f35356a
+.long 0xf95757ae,0xf95757ae
+.long 0xd0b9b969,0xd0b9b969
+.long 0x91868617,0x91868617
+.long 0x58c1c199,0x58c1c199
+.long 0x271d1d3a,0x271d1d3a
+.long 0xb99e9e27,0xb99e9e27
+.long 0x38e1e1d9,0x38e1e1d9
+.long 0x13f8f8eb,0x13f8f8eb
+.long 0xb398982b,0xb398982b
+.long 0x33111122,0x33111122
+.long 0xbb6969d2,0xbb6969d2
+.long 0x70d9d9a9,0x70d9d9a9
+.long 0x898e8e07,0x898e8e07
+.long 0xa7949433,0xa7949433
+.long 0xb69b9b2d,0xb69b9b2d
+.long 0x221e1e3c,0x221e1e3c
+.long 0x92878715,0x92878715
+.long 0x20e9e9c9,0x20e9e9c9
+.long 0x49cece87,0x49cece87
+.long 0xff5555aa,0xff5555aa
+.long 0x78282850,0x78282850
+.long 0x7adfdfa5,0x7adfdfa5
+.long 0x8f8c8c03,0x8f8c8c03
+.long 0xf8a1a159,0xf8a1a159
+.long 0x80898909,0x80898909
+.long 0x170d0d1a,0x170d0d1a
+.long 0xdabfbf65,0xdabfbf65
+.long 0x31e6e6d7,0x31e6e6d7
+.long 0xc6424284,0xc6424284
+.long 0xb86868d0,0xb86868d0
+.long 0xc3414182,0xc3414182
+.long 0xb0999929,0xb0999929
+.long 0x772d2d5a,0x772d2d5a
+.long 0x110f0f1e,0x110f0f1e
+.long 0xcbb0b07b,0xcbb0b07b
+.long 0xfc5454a8,0xfc5454a8
+.long 0xd6bbbb6d,0xd6bbbb6d
+.long 0x3a16162c,0x3a16162c
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.align 64
+.LAES_Td:
+.long 0x50a7f451,0x50a7f451
+.long 0x5365417e,0x5365417e
+.long 0xc3a4171a,0xc3a4171a
+.long 0x965e273a,0x965e273a
+.long 0xcb6bab3b,0xcb6bab3b
+.long 0xf1459d1f,0xf1459d1f
+.long 0xab58faac,0xab58faac
+.long 0x9303e34b,0x9303e34b
+.long 0x55fa3020,0x55fa3020
+.long 0xf66d76ad,0xf66d76ad
+.long 0x9176cc88,0x9176cc88
+.long 0x254c02f5,0x254c02f5
+.long 0xfcd7e54f,0xfcd7e54f
+.long 0xd7cb2ac5,0xd7cb2ac5
+.long 0x80443526,0x80443526
+.long 0x8fa362b5,0x8fa362b5
+.long 0x495ab1de,0x495ab1de
+.long 0x671bba25,0x671bba25
+.long 0x980eea45,0x980eea45
+.long 0xe1c0fe5d,0xe1c0fe5d
+.long 0x02752fc3,0x02752fc3
+.long 0x12f04c81,0x12f04c81
+.long 0xa397468d,0xa397468d
+.long 0xc6f9d36b,0xc6f9d36b
+.long 0xe75f8f03,0xe75f8f03
+.long 0x959c9215,0x959c9215
+.long 0xeb7a6dbf,0xeb7a6dbf
+.long 0xda595295,0xda595295
+.long 0x2d83bed4,0x2d83bed4
+.long 0xd3217458,0xd3217458
+.long 0x2969e049,0x2969e049
+.long 0x44c8c98e,0x44c8c98e
+.long 0x6a89c275,0x6a89c275
+.long 0x78798ef4,0x78798ef4
+.long 0x6b3e5899,0x6b3e5899
+.long 0xdd71b927,0xdd71b927
+.long 0xb64fe1be,0xb64fe1be
+.long 0x17ad88f0,0x17ad88f0
+.long 0x66ac20c9,0x66ac20c9
+.long 0xb43ace7d,0xb43ace7d
+.long 0x184adf63,0x184adf63
+.long 0x82311ae5,0x82311ae5
+.long 0x60335197,0x60335197
+.long 0x457f5362,0x457f5362
+.long 0xe07764b1,0xe07764b1
+.long 0x84ae6bbb,0x84ae6bbb
+.long 0x1ca081fe,0x1ca081fe
+.long 0x942b08f9,0x942b08f9
+.long 0x58684870,0x58684870
+.long 0x19fd458f,0x19fd458f
+.long 0x876cde94,0x876cde94
+.long 0xb7f87b52,0xb7f87b52
+.long 0x23d373ab,0x23d373ab
+.long 0xe2024b72,0xe2024b72
+.long 0x578f1fe3,0x578f1fe3
+.long 0x2aab5566,0x2aab5566
+.long 0x0728ebb2,0x0728ebb2
+.long 0x03c2b52f,0x03c2b52f
+.long 0x9a7bc586,0x9a7bc586
+.long 0xa50837d3,0xa50837d3
+.long 0xf2872830,0xf2872830
+.long 0xb2a5bf23,0xb2a5bf23
+.long 0xba6a0302,0xba6a0302
+.long 0x5c8216ed,0x5c8216ed
+.long 0x2b1ccf8a,0x2b1ccf8a
+.long 0x92b479a7,0x92b479a7
+.long 0xf0f207f3,0xf0f207f3
+.long 0xa1e2694e,0xa1e2694e
+.long 0xcdf4da65,0xcdf4da65
+.long 0xd5be0506,0xd5be0506
+.long 0x1f6234d1,0x1f6234d1
+.long 0x8afea6c4,0x8afea6c4
+.long 0x9d532e34,0x9d532e34
+.long 0xa055f3a2,0xa055f3a2
+.long 0x32e18a05,0x32e18a05
+.long 0x75ebf6a4,0x75ebf6a4
+.long 0x39ec830b,0x39ec830b
+.long 0xaaef6040,0xaaef6040
+.long 0x069f715e,0x069f715e
+.long 0x51106ebd,0x51106ebd
+.long 0xf98a213e,0xf98a213e
+.long 0x3d06dd96,0x3d06dd96
+.long 0xae053edd,0xae053edd
+.long 0x46bde64d,0x46bde64d
+.long 0xb58d5491,0xb58d5491
+.long 0x055dc471,0x055dc471
+.long 0x6fd40604,0x6fd40604
+.long 0xff155060,0xff155060
+.long 0x24fb9819,0x24fb9819
+.long 0x97e9bdd6,0x97e9bdd6
+.long 0xcc434089,0xcc434089
+.long 0x779ed967,0x779ed967
+.long 0xbd42e8b0,0xbd42e8b0
+.long 0x888b8907,0x888b8907
+.long 0x385b19e7,0x385b19e7
+.long 0xdbeec879,0xdbeec879
+.long 0x470a7ca1,0x470a7ca1
+.long 0xe90f427c,0xe90f427c
+.long 0xc91e84f8,0xc91e84f8
+.long 0x00000000,0x00000000
+.long 0x83868009,0x83868009
+.long 0x48ed2b32,0x48ed2b32
+.long 0xac70111e,0xac70111e
+.long 0x4e725a6c,0x4e725a6c
+.long 0xfbff0efd,0xfbff0efd
+.long 0x5638850f,0x5638850f
+.long 0x1ed5ae3d,0x1ed5ae3d
+.long 0x27392d36,0x27392d36
+.long 0x64d90f0a,0x64d90f0a
+.long 0x21a65c68,0x21a65c68
+.long 0xd1545b9b,0xd1545b9b
+.long 0x3a2e3624,0x3a2e3624
+.long 0xb1670a0c,0xb1670a0c
+.long 0x0fe75793,0x0fe75793
+.long 0xd296eeb4,0xd296eeb4
+.long 0x9e919b1b,0x9e919b1b
+.long 0x4fc5c080,0x4fc5c080
+.long 0xa220dc61,0xa220dc61
+.long 0x694b775a,0x694b775a
+.long 0x161a121c,0x161a121c
+.long 0x0aba93e2,0x0aba93e2
+.long 0xe52aa0c0,0xe52aa0c0
+.long 0x43e0223c,0x43e0223c
+.long 0x1d171b12,0x1d171b12
+.long 0x0b0d090e,0x0b0d090e
+.long 0xadc78bf2,0xadc78bf2
+.long 0xb9a8b62d,0xb9a8b62d
+.long 0xc8a91e14,0xc8a91e14
+.long 0x8519f157,0x8519f157
+.long 0x4c0775af,0x4c0775af
+.long 0xbbdd99ee,0xbbdd99ee
+.long 0xfd607fa3,0xfd607fa3
+.long 0x9f2601f7,0x9f2601f7
+.long 0xbcf5725c,0xbcf5725c
+.long 0xc53b6644,0xc53b6644
+.long 0x347efb5b,0x347efb5b
+.long 0x7629438b,0x7629438b
+.long 0xdcc623cb,0xdcc623cb
+.long 0x68fcedb6,0x68fcedb6
+.long 0x63f1e4b8,0x63f1e4b8
+.long 0xcadc31d7,0xcadc31d7
+.long 0x10856342,0x10856342
+.long 0x40229713,0x40229713
+.long 0x2011c684,0x2011c684
+.long 0x7d244a85,0x7d244a85
+.long 0xf83dbbd2,0xf83dbbd2
+.long 0x1132f9ae,0x1132f9ae
+.long 0x6da129c7,0x6da129c7
+.long 0x4b2f9e1d,0x4b2f9e1d
+.long 0xf330b2dc,0xf330b2dc
+.long 0xec52860d,0xec52860d
+.long 0xd0e3c177,0xd0e3c177
+.long 0x6c16b32b,0x6c16b32b
+.long 0x99b970a9,0x99b970a9
+.long 0xfa489411,0xfa489411
+.long 0x2264e947,0x2264e947
+.long 0xc48cfca8,0xc48cfca8
+.long 0x1a3ff0a0,0x1a3ff0a0
+.long 0xd82c7d56,0xd82c7d56
+.long 0xef903322,0xef903322
+.long 0xc74e4987,0xc74e4987
+.long 0xc1d138d9,0xc1d138d9
+.long 0xfea2ca8c,0xfea2ca8c
+.long 0x360bd498,0x360bd498
+.long 0xcf81f5a6,0xcf81f5a6
+.long 0x28de7aa5,0x28de7aa5
+.long 0x268eb7da,0x268eb7da
+.long 0xa4bfad3f,0xa4bfad3f
+.long 0xe49d3a2c,0xe49d3a2c
+.long 0x0d927850,0x0d927850
+.long 0x9bcc5f6a,0x9bcc5f6a
+.long 0x62467e54,0x62467e54
+.long 0xc2138df6,0xc2138df6
+.long 0xe8b8d890,0xe8b8d890
+.long 0x5ef7392e,0x5ef7392e
+.long 0xf5afc382,0xf5afc382
+.long 0xbe805d9f,0xbe805d9f
+.long 0x7c93d069,0x7c93d069
+.long 0xa92dd56f,0xa92dd56f
+.long 0xb31225cf,0xb31225cf
+.long 0x3b99acc8,0x3b99acc8
+.long 0xa77d1810,0xa77d1810
+.long 0x6e639ce8,0x6e639ce8
+.long 0x7bbb3bdb,0x7bbb3bdb
+.long 0x097826cd,0x097826cd
+.long 0xf418596e,0xf418596e
+.long 0x01b79aec,0x01b79aec
+.long 0xa89a4f83,0xa89a4f83
+.long 0x656e95e6,0x656e95e6
+.long 0x7ee6ffaa,0x7ee6ffaa
+.long 0x08cfbc21,0x08cfbc21
+.long 0xe6e815ef,0xe6e815ef
+.long 0xd99be7ba,0xd99be7ba
+.long 0xce366f4a,0xce366f4a
+.long 0xd4099fea,0xd4099fea
+.long 0xd67cb029,0xd67cb029
+.long 0xafb2a431,0xafb2a431
+.long 0x31233f2a,0x31233f2a
+.long 0x3094a5c6,0x3094a5c6
+.long 0xc066a235,0xc066a235
+.long 0x37bc4e74,0x37bc4e74
+.long 0xa6ca82fc,0xa6ca82fc
+.long 0xb0d090e0,0xb0d090e0
+.long 0x15d8a733,0x15d8a733
+.long 0x4a9804f1,0x4a9804f1
+.long 0xf7daec41,0xf7daec41
+.long 0x0e50cd7f,0x0e50cd7f
+.long 0x2ff69117,0x2ff69117
+.long 0x8dd64d76,0x8dd64d76
+.long 0x4db0ef43,0x4db0ef43
+.long 0x544daacc,0x544daacc
+.long 0xdf0496e4,0xdf0496e4
+.long 0xe3b5d19e,0xe3b5d19e
+.long 0x1b886a4c,0x1b886a4c
+.long 0xb81f2cc1,0xb81f2cc1
+.long 0x7f516546,0x7f516546
+.long 0x04ea5e9d,0x04ea5e9d
+.long 0x5d358c01,0x5d358c01
+.long 0x737487fa,0x737487fa
+.long 0x2e410bfb,0x2e410bfb
+.long 0x5a1d67b3,0x5a1d67b3
+.long 0x52d2db92,0x52d2db92
+.long 0x335610e9,0x335610e9
+.long 0x1347d66d,0x1347d66d
+.long 0x8c61d79a,0x8c61d79a
+.long 0x7a0ca137,0x7a0ca137
+.long 0x8e14f859,0x8e14f859
+.long 0x893c13eb,0x893c13eb
+.long 0xee27a9ce,0xee27a9ce
+.long 0x35c961b7,0x35c961b7
+.long 0xede51ce1,0xede51ce1
+.long 0x3cb1477a,0x3cb1477a
+.long 0x59dfd29c,0x59dfd29c
+.long 0x3f73f255,0x3f73f255
+.long 0x79ce1418,0x79ce1418
+.long 0xbf37c773,0xbf37c773
+.long 0xeacdf753,0xeacdf753
+.long 0x5baafd5f,0x5baafd5f
+.long 0x146f3ddf,0x146f3ddf
+.long 0x86db4478,0x86db4478
+.long 0x81f3afca,0x81f3afca
+.long 0x3ec468b9,0x3ec468b9
+.long 0x2c342438,0x2c342438
+.long 0x5f40a3c2,0x5f40a3c2
+.long 0x72c31d16,0x72c31d16
+.long 0x0c25e2bc,0x0c25e2bc
+.long 0x8b493c28,0x8b493c28
+.long 0x41950dff,0x41950dff
+.long 0x7101a839,0x7101a839
+.long 0xdeb30c08,0xdeb30c08
+.long 0x9ce4b4d8,0x9ce4b4d8
+.long 0x90c15664,0x90c15664
+.long 0x6184cb7b,0x6184cb7b
+.long 0x70b632d5,0x70b632d5
+.long 0x745c6c48,0x745c6c48
+.long 0x4257b8d0,0x4257b8d0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
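
The .byte tables above are four copies of the AES inverse S-box (opening 0x52,0x09,0x6a,0xd5..., the standard InvSubBytes table), each copy followed by the same three .long mask words. Those masks -- 0x80808080, 0xfefefefe and 0x1b1b1b1b -- let the "compact" code paths double four GF(2^8) field elements packed in one 32-bit register arithmetically instead of via lookup tables. A minimal C sketch of that packed doubling, using the same constants (an illustration, not the module's code):

#include <stdint.h>
#include <stdio.h>

/* Double four AES field elements packed in a 32-bit word. */
static uint32_t xtime4(uint32_t x)
{
    uint32_t hi = x & 0x80808080u;                /* top bit of each byte          */
    uint32_t lo = (x << 1) & 0xfefefefeu;         /* shift, drop cross-byte carries */
    return lo ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu); /* fold reduction poly 0x11b      */
}

int main(void)
{
    printf("%08x\n", xtime4(0x80025753u));        /* expect 1b04aea6 */
    return 0;
}
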
diff --git a/main/openssl/crypto/aes/asm/aes-x86_64.pl b/main/openssl/crypto/aes/asm/aes-x86_64.pl
index a545e892..34cbb5d8 100755
--- a/main/openssl/crypto/aes/asm/aes-x86_64.pl
+++ b/main/openssl/crypto/aes/asm/aes-x86_64.pl
@@ -36,7 +36,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$verticalspin=1; # unlike the 32-bit version, $verticalspin performs
# ~15% better on both AMD and Intel cores
@@ -588,6 +589,9 @@ $code.=<<___;
.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
@@ -1184,6 +1188,9 @@ $code.=<<___;
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
push %rbx
push %rbp
@@ -1277,13 +1284,13 @@ $code.=<<___;
___
}
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function,3
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function,3
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
@@ -1304,7 +1311,7 @@ AES_set_encrypt_key:
add \$56,%rsp
.Lenc_key_epilogue:
ret
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
@@ -1547,13 +1554,13 @@ $code.=<<___;
___
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function,3
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function,3
.align 16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
push %rbx
push %rbp
push %r12
@@ -1622,7 +1629,7 @@ $code.=<<___;
add \$56,%rsp
.Ldec_key_epilogue:
ret
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -1648,6 +1655,9 @@ $code.=<<___;
.type AES_cbc_encrypt,\@function,6
.align 16
.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmp \$0,%rdx # check length
je .Lcbc_epilogue
@@ -2766,13 +2776,13 @@ cbc_se_handler:
.rva .LSEH_end_AES_decrypt
.rva .LSEH_info_AES_decrypt
- .rva .LSEH_begin_AES_set_encrypt_key
- .rva .LSEH_end_AES_set_encrypt_key
- .rva .LSEH_info_AES_set_encrypt_key
+ .rva .LSEH_begin_private_AES_set_encrypt_key
+ .rva .LSEH_end_private_AES_set_encrypt_key
+ .rva .LSEH_info_private_AES_set_encrypt_key
- .rva .LSEH_begin_AES_set_decrypt_key
- .rva .LSEH_end_AES_set_decrypt_key
- .rva .LSEH_info_AES_set_decrypt_key
+ .rva .LSEH_begin_private_AES_set_decrypt_key
+ .rva .LSEH_end_private_AES_set_decrypt_key
+ .rva .LSEH_info_private_AES_set_decrypt_key
.rva .LSEH_begin_AES_cbc_encrypt
.rva .LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2798,11 @@ cbc_se_handler:
.byte 9,0,0,0
.rva block_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
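
Taken together, the hunks above give the assembly two faces: hidden asm_AES_* aliases for direct use by other assembly or a statically linked consumer, and private_AES_set_*_key names that leave the public AES_set_*_key symbols free for a C-level wrapper. A hypothetical sketch of such glue (the macro mapping is illustrative, not the exact OpenSSL header):

/* hypothetical glue: route the public API onto the renamed entry points */
typedef struct aes_key_st AES_KEY;

int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                                AES_KEY *key);
int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
                                AES_KEY *key);

#define AES_set_encrypt_key private_AES_set_encrypt_key
#define AES_set_decrypt_key private_AES_set_decrypt_key
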
diff --git a/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S
new file mode 100644
index 00000000..32fd600b
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S
@@ -0,0 +1,1396 @@
+.text
+
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,@function
+.align 16
+aesni_cbc_sha1_enc:
+
+ movl OPENSSL_ia32cap_P+0(%rip),%r10d
+ movl OPENSSL_ia32cap_P+4(%rip),%r11d
+ jmp aesni_cbc_sha1_enc_ssse3
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc_ssse3,@function
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -104(%rsp),%rsp
+
+
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqu (%r8),%xmm11
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ movups (%r15),%xmm13
+ movups 16(%r15),%xmm14
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ movups 0(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Laesenclast1
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast1
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast1:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ movups 16(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,0(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast2
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast2
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast2:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ movups 32(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,16(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ cmpl $11,%r8d
+ jb .Laesenclast3
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast3
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast3:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ movups 48(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,32(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r14,%r10
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast4
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast4
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast4:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast5
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast5
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast5:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ movups %xmm11,(%r8)
+ leaq 104(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+
+.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
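
K_XX_XX above packs the four standard SHA-1 round constants, one per 20-round group and broadcast across all four lanes, plus the 0x00010203... pshufb mask used to byte-swap the big-endian input words. The constant selection, restated as scalar C (a hedged restatement, not code generated by this module):

#include <stdint.h>

/* SHA-1 round constant for round i (0..79) */
static uint32_t sha1_k(int i)
{
    static const uint32_t K[4] = {
        0x5a827999u,  /* rounds  0..19 */
        0x6ed9eba1u,  /* rounds 20..39 */
        0x8f1bbcdcu,  /* rounds 40..59 */
        0xca62c1d6u   /* rounds 60..79 */
    };
    return K[i / 20];
}
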
diff --git a/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 00000000..3c8f6c19
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1250 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encryption exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows processor
+# resources to be utilized better, achieving better overall performance.
+# The SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# the AESNI code is woven into them. Below are performance numbers in
+# cycles per processed byte (less is better) for standalone AESNI-CBC
+# encrypt, for the sum of the latter and standalone SHA1, and for the
+# "stitched" subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.6] 9.37 6.65 +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+#
+# AES-192-CBC
+# Westmere 4.51 10.11 6.97 +45%
+# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+#
+# AES-256-CBC
+# Westmere 5.25 10.85 7.25 +50%
+# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+#
+# (*) There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
+# background information. The numbers in parentheses above are SSSE3
+# results collected on an AVX-capable CPU, i.e. they apply on OSes that
+# don't support AVX.
+#
+# Needless to mention, it makes no sense to implement a "stitched"
+# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
+# utilize the available parallelism fully, so stitching would not give
+# any gain. Well, there might be some, e.g. from better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.31 1.55 1.80
+# Sandy Bridge 0.93 1.06 1.22
+
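
To make the stitching idea concrete before the generator code, here is a minimal standalone C sketch (assumes an AES-NI-capable toolchain, e.g. gcc -maes; the pairing of one AES round per SHA-1 step is schematic, not the module's actual schedule):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>
#include <wmmintrin.h>  /* _mm_aesenc_si128 */

int main(void)
{
    __m128i iv = _mm_set1_epi32(0x01234567);
    __m128i rk = _mm_set1_epi32((int)0x89abcdefu);
    uint32_t a = 0x67452301u, e = 0xc3d2e1f0u;

    /* scalar SHA-1-style work runs on the integer ALUs while the AES
       round issues to the dedicated AES-NI unit -- the core of the gain */
    for (int i = 0; i < 10; i++) {
        e += ((a << 5) | (a >> 27)) + 0x5a827999u;   /* rol(a,5) + K  */
        iv = _mm_aesenc_si128(iv, rk);               /* one AES round */
    }
    printf("%08x %08x\n", e, (uint32_t)_mm_cvtsi128_si32(iv));
    return 0;
}
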
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
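
In C terms, the dispatch just emitted reads roughly as below (the word/bit positions come from the comments in this block; the function itself is an illustration):

/* cap[0], cap[1]: the two OPENSSL_ia32cap_P words loaded into r10d/r11d */
static int use_avx_path(const unsigned int cap[2])
{
    unsigned int avx   = cap[1] & (1u << 28);   /* AVX bit          */
    unsigned int intel = cap[0] & (1u << 30);   /* "Intel CPU" bit  */
    return avx && intel;                        /* otherwise: SSSE3 */
}
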
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
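
The k==9 arm emits the tail that finishes the AES rounds: $rounds holds the count read from 240($key) -- 10, 12 or 14 for AES-128/192/256 -- and the jb/je pair selects zero, two or four extra aesenc before the closing aesenclast. A schematic C model of that control flow (illustrative, not a drop-in equivalent):

/* extra aesenc issued by the tail before aesenclast */
static int extra_rounds(int rounds)   /* 10, 12 or 14 */
{
    if (rounds < 11) return 0;   /* AES-128: straight to aesenclast */
    if (rounds < 13) return 2;   /* AES-192: two more rounds        */
    return 4;                    /* AES-256: four more rounds       */
}
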
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
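
Xupdate_ssse3_16_31 computes the SHA-1 message schedule four elements per vector; the taps named in the comments ("X[-3]", "X[-8]", "X[-14]", "X[-16]") are exactly the scalar recurrence below, and the <<<2 fixup at the end resolves the one lane that would otherwise depend on a value produced in the same vector. As a hedged scalar restatement:

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* standard SHA-1 schedule expansion, W[0..15] preloaded from the block */
static void sha1_expand(uint32_t W[80])
{
    for (int i = 16; i < 80; i++)
        W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
}
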
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
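
From round 32 on, the code uses the algebraically equivalent doubled recurrence, whose taps (X[-6], X[-16], X[-28], X[-32], rotate by 2) all lie outside the 4-lane vector being built, removing the intra-vector dependency handled above. Scalar form (rol32 as in the previous sketch):

/* equivalent SHA-1 expansion for i >= 32 */
static void sha1_expand_32_79(uint32_t W[80])
{
    for (int i = 32; i < 80; i++)
        W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2);
}
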
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_20_39 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_40_59 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
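
The three body_* generators implement the standard SHA-1 round functions, and their aesenc ratios (12 per 20 rounds for body_00_19/body_40_59, 8 for body_20_39) spread the 40 AES rounds of four CBC blocks (the AES-128 case) evenly across the 80 SHA-1 rounds. The boolean functions, restated in C:

/* rounds 0..19: the asm computes (b & (c ^ d)) ^ d via the xor/and/xor dance */
static uint32_t Ch (uint32_t b, uint32_t c, uint32_t d) { return (b & (c ^ d)) ^ d; }

/* rounds 20..39 and 60..79 */
static uint32_t Par(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }

/* rounds 40..59: majority, via the same masking trick */
static uint32_t Maj(uint32_t b, uint32_t c, uint32_t d) { return (b & (c ^ d)) ^ (c & d); }
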
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
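+# with identical source and destination, shld $n,%r,%r is a rotate left
+# by n and shrd $n,%r,%r a rotate right, so the shared body_* subs can
+# keep using the $_rol/$_ror macros unchanged on the AVX path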
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add \$112,$key # size optimization
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovups `16*$n`($in0),$in # load input
+ vxorps $rndkey0,$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vxorps $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
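+# One &$aesenc() is spliced in per scheduled SHA-1 round; $r counts them.
+# Every 10th call ($k==0) starts a new block: the next 16 input bytes are
+# loaded and folded into $iv (CBC chaining fused with round-0
+# AddRoundKey) and the previous ciphertext block is stored.  $k==9 ends
+# the block, branching on $rounds for the extra AES-192/256 rounds, and
+# @rndkey is rotated so the next round key loads while the current
+# vaesenc executes.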
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
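+# Rounds 16..31: four schedule words are produced per call from the
+# textbook recurrence W[i] = (W[i-3]^W[i-8]^W[i-14]^W[i-16]) <<< 1.
+# W[i-3] for the last lane lives in the very vector being computed, so
+# that lane starts as zero and is patched by the "X[0]"^=("X[0]">>96)<<<2
+# step above, while eval(shift(@insns)) interleaves the scalar round
+# snippets (and their aesencs) with the vector work.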
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
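+# Rounds 32..79: unrolling the recurrence once gives the equivalent form
+# W[i] = (W[i-6]^W[i-16]^W[i-28]^W[i-32]) <<< 2, whose operands all lie
+# outside the vector being computed -- no cross-lane patch-up is needed,
+# only the vpalignr that composes "X[-6]" from two registers.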
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
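+# Xuplast_avx_80 transfers the final X[]+K words and, unless the input is
+# exhausted, loads the next 64-byte block and byte-swaps its first xmm;
+# the three Xloop_avx calls below swap the rest and stage X[]+K for
+# rounds 0..11 of that block while the current block's last scalar rounds
+# retire.  Xtail_avx merely drains leftover round snippets on the way out.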
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
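+# The handler serves both the SSSE3 and AVX routines: when RIP lies
+# between the prologue and epilogue labels it copies the ten stacked
+# xmm6-xmm15 back into the CONTEXT record, reloads the six non-volatile
+# GPRs from the frame, and then lets RtlVirtualUnwind finish, which is
+# what keeps these custom stack layouts unwindable on Win64.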
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
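+# a REX prefix is needed only when %xmm8-%xmm15 are involved: bit 2
+# (REX.R) extends the ModR/M reg field (the destination here), bit 0
+# (REX.B) the r/m field (the source), and 0x40 is the fixed marker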
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
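+# aesni() lets the module build with assemblers that predate AES-NI:
+# every two-register aesenc/aesenclast the script emitted is rewritten
+# into its literal 66 0F 38 DC/DD encoding by the s///gem pass below.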
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/aesni-x86.S b/main/openssl/crypto/aes/asm/aesni-x86.S
new file mode 100644
index 00000000..0766bb54
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86.S
@@ -0,0 +1,2143 @@
+.file "crypto/aes/asm/aesni-x86.s"
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+.L_aesni_encrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L000enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L000enc1_loop_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_encrypt,.-.L_aesni_encrypt_begin
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+.L_aesni_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001dec1_loop_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_decrypt,.-.L_aesni_decrypt_begin
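+# The .byte runs are hand-assembled AES-NI for pre-AES-NI binutils:
+# 102,15,56,220 (66 0F 38 DC) is aesenc, 221 aesenclast, 222 aesdec and
+# 223 aesdeclast, each followed by its ModR/M byte.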
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L002enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+ movups (%edx),%xmm0
+ jnz .L002enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L003dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+ movups (%edx),%xmm0
+ jnz .L003dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L004enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%edx),%xmm0
+ jnz .L004enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L005dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%edx),%xmm0
+ jnz .L005dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ jmp .L_aesni_encrypt6_enter
+.align 16
+.L006enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.align 16
+.L_aesni_encrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%edx),%xmm0
+ jnz .L006enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ jmp .L_aesni_decrypt6_enter
+.align 16
+.L007dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.align 16
+.L_aesni_decrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%edx),%xmm0
+ jnz .L007dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
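+# The 3/4/6-block helpers keep independent blocks in flight to hide
+# aesenc/aesdec latency; the round count in %ecx is halved on entry
+# (shrl $1) because every loop iteration retires two rounds, alternating
+# between the keys staged in %xmm0 and %xmm1.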
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+.L_aesni_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L008ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L009ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L010ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L011ecb_enc_loop6_enter
+.align 16
+.L012ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L011ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L012ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L010ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L013ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L014ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L015ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L016ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L013ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L017enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L017enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L014ecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L015ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L016ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L009ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L018ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L019ecb_dec_loop6_enter
+.align 16
+.L020ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L019ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L020ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L018ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L021ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L022ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L023ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L024ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L021ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L025dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L025dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L022ecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L023ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L024ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L008ecb_ret:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin
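+# The ECB paths are software-pipelined: inside the loop6 bodies the
+# stores of the previous six blocks are interleaved with the loads of
+# the next six, and whatever remains afterwards is routed to a 1-, 2-,
+# 3-, 4- or 6-block tail.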
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+.L_aesni_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shrl $1,%ecx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+.L026ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm3
+ movups (%edx),%xmm0
+.L027ccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L027ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ decl %eax
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ jnz .L026ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin
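+# In the CCM64 routines %xmm7 carries the counter block and %xmm3 the
+# running CBC-MAC, encrypted in lock-step two at a time.  The four
+# immediates parked at (%esp) form a full byte-reversal pshufb mask (the
+# 102,15,56,0 .byte lines) and the 1 kept at 16(%esp) increments the
+# 64-bit counter via paddq between blocks.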
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+.L_aesni_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L028enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L028enc1_loop_5
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ jmp .L029ccm64_dec_outer
+.align 16
+.L029ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movl %ebx,%ecx
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L030ccm64_dec_break
+ movups (%ebp),%xmm0
+ shrl $1,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups (%edx),%xmm0
+.L031ccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L031ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leal 16(%esi),%esi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .L029ccm64_dec_outer
+.align 16
+.L030ccm64_dec_break:
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L032enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L032enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+.L_aesni_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L033ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,203,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,197,0
+ incl %ebx
+.byte 102,15,58,34,203,1
+ incl %ebp
+.byte 102,15,58,34,197,1
+ incl %ebx
+.byte 102,15,58,34,203,2
+ incl %ebp
+.byte 102,15,58,34,197,2
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ pshufd $192,%xmm1,%xmm2
+ pshufd $128,%xmm1,%xmm3
+ cmpl $6,%eax
+ jb .L034ctr32_tail
+ movdqa %xmm7,32(%esp)
+ shrl $1,%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $6,%eax
+ jmp .L035ctr32_loop6
+.align 16
+.L035ctr32_loop6:
+ pshufd $64,%xmm1,%xmm4
+ movdqa 32(%esp),%xmm1
+ pshufd $192,%xmm0,%xmm5
+ por %xmm1,%xmm2
+ pshufd $128,%xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufd $64,%xmm0,%xmm7
+ por %xmm1,%xmm4
+ por %xmm1,%xmm5
+ por %xmm1,%xmm6
+ por %xmm1,%xmm7
+ movups (%ebp),%xmm0
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ decl %ecx
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 48(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 64(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm1,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movl %ebx,%ecx
+ pshufd $128,%xmm1,%xmm3
+ subl $6,%eax
+ jnc .L035ctr32_loop6
+ addl $6,%eax
+ jz .L036ctr32_ret
+ movl %ebp,%edx
+ leal 1(,%ecx,2),%ecx
+ movdqa 32(%esp),%xmm7
+.L034ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L037ctr32_one
+ pshufd $64,%xmm1,%xmm4
+ por %xmm7,%xmm3
+ je .L038ctr32_two
+ pshufd $192,%xmm0,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L039ctr32_three
+ pshufd $128,%xmm0,%xmm6
+ por %xmm7,%xmm5
+ je .L040ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L033ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L037ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L041enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L041enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L038ctr32_two:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L039ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L040ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L036ctr32_ret:
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin
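+# The CTR32 path builds six counter blocks per iteration: the IV's last
+# dword is extracted and re-inserted with pextrd/pinsrd (102,15,58,22 /
+# 102,15,58,34), incremented in integer registers and byte-swapped, and
+# the encrypted counters are finally xored with the plaintext; only the
+# 32-bit counter word is ever modified.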
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+.L_aesni_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L042enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L042enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L043xts_enc_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L044xts_enc_loop6
+.align 16
+.L044xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,220,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L044xts_enc_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L043xts_enc_short:
+ addl $96,%eax
+ jz .L045xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L046xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L047xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L048xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L049xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L050xts_enc_done
+.align 16
+.L046xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L051enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L051enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L047xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L048xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L049xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L045xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L053xts_enc_steal
+.align 16
+.L050xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L053xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L053xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L054enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L054enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L052xts_enc_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin
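+# The XTS tweak advances by doubling in GF(2^128): paddq shifts both
+# 64-bit halves left by one, pcmpgtd captures the lost sign bits, and
+# pshufd $19 plus pand with the {135,0,1,0} constant at 96(%esp) carry
+# the low half's MSB into the high half and fold the 0x87 reduction
+# polynomial back in when bit 127 overflows.  The xts_enc_steal loop
+# implements ciphertext stealing for a trailing partial block.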
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+.L_aesni_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L055enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L055enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L056xts_dec_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L057xts_dec_loop6
+.align 16
+.L057xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,222,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L057xts_dec_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L056xts_dec_short:
+ addl $96,%eax
+ jz .L058xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L059xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L060xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L061xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L062xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L063xts_dec_done
+.align 16
+.L059xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L064dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L064dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L060xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L061xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L062xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L058xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L066xts_dec_only_one_more
+.align 16
+.L063xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L066xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L067dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L067dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L068xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L068xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L069dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L069dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L065xts_dec_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin
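+# Decryption steals in the opposite direction: when the length is not a
+# multiple of 16 (the setnz/shll $4 test near the top) one whole block is
+# held back, the last full block is decrypted with the later tweak, bytes
+# are swapped in xts_dec_steal, and a final single-block decrypt with the
+# earlier tweak finishes the tail.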
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L070cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L071cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L072cbc_enc_tail
+ subl $16,%eax
+ jmp .L073cbc_enc_loop
+.align 16
+.L073cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L074enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L074enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L073cbc_enc_loop
+ addl $16,%eax
+ jnz .L072cbc_enc_tail
+ movaps %xmm2,%xmm7
+ jmp .L075cbc_ret
+.L072cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L073cbc_enc_loop
+.align 16
+.L071cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L076cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L077cbc_dec_loop6_enter
+.align 16
+.L078cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L077cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L078cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L079cbc_dec_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L076cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L080cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L081cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L082cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L083cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ subl $80,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L080cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L084dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L084dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L081cbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L082cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ subl $64,%eax
+.L079cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L085cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ jmp .L075cbc_ret
+.align 16
+.L085cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+.L075cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ movups %xmm7,(%ebp)
+.L070cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin
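+# CBC encryption is inherently serial (each ciphertext feeds the next
+# block), so the encrypt path runs one block at a time while decryption
+# uses the six-block pipeline.  The raw .long 2767451785 / 2868115081
+# words appear to decode to "mov %esi,%esi; rep movsb" and "mov %esi,
+# %esi; rep stosb", copying the short tail into place and zero-padding
+# it to a full block before one last pass through the encrypt loop.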
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ testl %eax,%eax
+ jz .L086bad_pointer
+ testl %edx,%edx
+ jz .L086bad_pointer
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je .L08714rounds
+ cmpl $192,%ecx
+ je .L08812rounds
+ cmpl $128,%ecx
+ jne .L089bad_keybits
+.align 16
+.L09010rounds:
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L091key_128_cold
+.byte 102,15,58,223,200,2
+ call .L092key_128
+.byte 102,15,58,223,200,4
+ call .L092key_128
+.byte 102,15,58,223,200,8
+ call .L092key_128
+.byte 102,15,58,223,200,16
+ call .L092key_128
+.byte 102,15,58,223,200,32
+ call .L092key_128
+.byte 102,15,58,223,200,64
+ call .L092key_128
+.byte 102,15,58,223,200,128
+ call .L092key_128
+.byte 102,15,58,223,200,27
+ call .L092key_128
+.byte 102,15,58,223,200,54
+ call .L092key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L092key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L091key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L08812rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L093key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L094key_192b
+.byte 102,15,58,223,202,4
+ call .L095key_192a
+.byte 102,15,58,223,202,8
+ call .L094key_192b
+.byte 102,15,58,223,202,16
+ call .L095key_192a
+.byte 102,15,58,223,202,32
+ call .L094key_192b
+.byte 102,15,58,223,202,64
+ call .L095key_192a
+.byte 102,15,58,223,202,128
+ call .L094key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L095key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L093key_192a_cold:
+ movaps %xmm2,%xmm5
+.L096key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L094key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L096key_192b_warm
+.align 16
+.L08714rounds:
+ movups 16(%eax),%xmm2
+ movl $13,%ecx
+ leal 16(%edx),%edx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L097key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L098key_256b
+.byte 102,15,58,223,202,2
+ call .L099key_256a
+.byte 102,15,58,223,200,2
+ call .L098key_256b
+.byte 102,15,58,223,202,4
+ call .L099key_256a
+.byte 102,15,58,223,200,4
+ call .L098key_256b
+.byte 102,15,58,223,202,8
+ call .L099key_256a
+.byte 102,15,58,223,200,8
+ call .L098key_256b
+.byte 102,15,58,223,202,16
+ call .L099key_256a
+.byte 102,15,58,223,200,16
+ call .L098key_256b
+.byte 102,15,58,223,202,32
+ call .L099key_256a
+.byte 102,15,58,223,200,32
+ call .L098key_256b
+.byte 102,15,58,223,202,64
+ call .L099key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L099key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L097key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L098key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 4
+.L086bad_pointer:
+ movl $-1,%eax
+ ret
+.align 4
+.L089bad_keybits:
+ movl $-2,%eax
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+.L_aesni_set_encrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.L_aesni_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L100dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L101dec_key_inverse:
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L101dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ xorl %eax,%eax
+.L100dec_key_ret:
+ ret
+.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/aes/asm/aesni-x86.pl b/main/openssl/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 00000000..3dc345b5
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
+# below for details].
+#
+# Performance.
+#
+# To start with, see the corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling a table similar to the one found there, I've
+# chosen to summarize *comparison* results for raw ECB, CTR and CBC
+# benchmarks. The simplified table below represents 32-bit performance
+# relative to 64-bit performance at every given point. Ratios vary
+# across encryption modes, hence the interval values.
+#
+#	16-byte	    64-byte	256-byte    1-KB	8-KB
+#	53-67%	    67-84%	91-94%	    95-98%	97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. For the
+# largest, 8-KB, blocks performance is virtually the same: 32-bit code
+# is less than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
+# module interleaves at most 6 aes[enc|dec] instructions, because
+# there are not enough registers for 8x interleave [which should be
+# optimal for Sandy Bridge]. In fact, the performance results for the
+# 6x interleave factor presented in aesni-x86_64.pl (except for CTR)
+# were measured with this module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with a 128-bit key; Sandy Bridge, 1.09.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-586.pl:-)
+$inline=1; # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni") { $movekey=*movups; }
+else { $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx"; # backup copy for $rounds
+$key_="ebp"; # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
+
+# AES-NI extension
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc { aescommon(0xdb,@_); }
+sub aesenc { aescommon(0xdc,@_); }
+sub aesenclast { aescommon(0xdd,@_); }
+sub aesdec { aescommon(0xde,@_); }
+sub aesdeclast { aescommon(0xdf,@_); }
+
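+# The helpers above hand-assemble AES-NI instructions as raw data
+# bytes, so that assemblers predating AES-NI still accept the output.
+# A standalone sketch of the register-to-register encoding follows
+# (illustrative only, never called by the module; ModRM.reg holds the
+# destination register number, ModRM.rm the source):
+sub encode_aesni_sketch
+{ my ($opcodelet,$dst,$src)=@_;		# e.g. 0xdc is aesenc
+  return (0x66,0x0f,0x38,$opcodelet,0xc0|($dst<<3)|$src);
+}
+# encode_aesni_sketch(0xdc,2,1) yields 102,15,56,220,209, i.e. the
+# very ".byte" sequence for aesenc %xmm1,%xmm2 seen in the generated
+# .S file above.
+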
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
+ &set_label("${p}1_loop_$sn");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+ &jnz (&label("${p}1_loop_$sn"));
+ eval"&aes${p}last ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1 # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+ &function_begin_B("_aesni_${p}rypt1");
+ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
+ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if (!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even
+# though aes[enc|dec] latency was originally 6, it could be scheduled
+# only every *2nd* cycle. Thus 3x interleave provided optimal
+# utilization, i.e. the subroutine's throughput is virtually the same
+# as that of the non-interleaved subroutine [for up to 3 input
+# blocks]. This is why it makes no sense to implement a 2x subroutine.
+# aes[enc|dec] latency in the next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# the new processor is therefore 8x, but it's infeasible to
+# accommodate that in the XMM registers addressable in 32-bit mode,
+# so 6x is used instead...
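+#
+# A back-of-the-envelope check of the argument above, as a standalone
+# helper (illustrative only, never called by the module): the optimal
+# interleave factor is latency divided by issue interval, rounded up.
+sub optimal_interleave_sketch
+{ my ($latency,$issue_interval)=@_;
+  return int(($latency+$issue_interval-1)/$issue_interval);
+}
+# optimal_interleave_sketch(6,2) == 3 on the original parts,
+# optimal_interleave_sketch(8,1) == 8 on the newer ones, which 32-bit
+# mode cannot accommodate, hence the 6x compromise.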
+
+sub aesni_generate3
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}3_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4-block performance, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt4");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shr ($rounds,1);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}4_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}4_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ eval"&aes${p} ($inout4,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &jmp (&label("_aesni_${p}rypt6_enter"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter",16);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+&function_begin("aesni_ecb_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+ &call ("_aesni_encrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details).
+#
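+# The encrypt path below implements, per block, the following dataflow
+# (minimal runnable sketch, illustrative only and never called by the
+# module; $E is a block-cipher code ref standing in for one AES
+# encryption, the other arguments are 16-byte strings, and the 64-bit
+# counter increment is left to the caller):
+sub ccm64_block_sketch
+{ my ($E,$ctr,$cmac,$pt)=@_;
+  $cmac = $E->($cmac ^ $pt);	# CBC-MAC absorbs the plaintext
+  my $ct = $pt ^ $E->($ctr);	# CTR keystream encrypts the data
+  return ($cmac,$ct);
+}
+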
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &shr ($rounds,1);
+ &lea ($key_,&DWP(0,$key));
+ &movdqa ($inout3,&QWP(0,"esp"));
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds_,$rounds);
+ &pshufb ($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &movups ($in0,&QWP(0,$inp));
+
+ &xorps ($inout0,$rndkey0);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($rndkey0,$in0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($cmac,$rndkey0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+
+ &dec ($len);
+ &lea ($inp,&DWP(16,$inp));
+ &xorps ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+ &jnz (&label("ccm64_enc_outer"));
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec);
+
+ &mov ($key_,$key);
+ &mov ($rounds_,$rounds);
+
+ &pshufb ($ivec,$inout3);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &lea ($inp,&QWP(16,$inp));
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+ &xorps ($in0,$inout0); # inp ^= E(ivec)
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds,$rounds_);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+
+ &sub ($len,1);
+ &jz (&label("ccm64_dec_break"));
+
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_dec2_loop"));
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &lea ($inp,&QWP(16,$inp));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ &mov ($key,$key_);
+ if ($inline)
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
+ else
+ { &call ("_aesni_encrypt1",$cmac); }
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
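+#
+# Standalone sketch of the counter handling (illustrative only, never
+# called by the module; assumes a 16-byte $ivec string whose last 4
+# bytes are a big-endian block counter, which the code below pulls
+# with pextrd and bswap):
+sub ctr32_blocks_sketch
+{ my ($ivec,$n)=@_;
+  my $ctr  = unpack("N",substr($ivec,12,4));	# pull 32-bit counter
+  my $base = substr($ivec,0,12);		# counter-less ivec
+  return map { $base.pack("N",($ctr+$_)&0xffffffff) } (0..$n-1);
+}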
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",88);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(80,"esp"),$key_);
+
+ &cmp ($len,1);
+ &je (&label("ctr32_one_shortcut"));
+
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,6);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+
+ # compose 2 vectors of 3x32-bit counters
+ &bswap ($rounds_);
+ &pxor ($rndkey1,$rndkey1);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey1,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey0,$key_,0);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,1);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+
+ &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
+ &shr ($rounds,1);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout0,$rndkey1); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout1,$rndkey1);
+ &pshufd ($inout5,$rndkey0,1<<6);
+ &por ($inout2,$rndkey1);
+ &por ($inout3,$rndkey1);
+ &por ($inout4,$rndkey1);
+ &por ($inout5,$rndkey1);
+
+ # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &dec ($rounds);
+ &pxor ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,$rndkey0);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,$rndkey0);
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 1st triplet increment
+ &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey1,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+
+ &mov ($rounds,$rounds_);
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
+
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &mov ($key,$key_);
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
+
+&set_label("ctr32_tail");
+ &por ($inout0,$inout5);
+ &cmp ($len,2);
+ &jb (&label("ctr32_one"));
+
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
+
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
+ &mov ($rounds,&DWP(240,$key));
+
+&set_label("ctr32_one");
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &movups ($inout5,&QWP(0x20,$inp));
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+ &call ("_aesni_encrypt4");
+ &movups ($inout4,&QWP(0,$inp));
+ &movups ($inout5,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout0,$inout4);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout1,$inout5);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2
+# const unsigned char iv[16]);
+#
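+# Each block's tweak is the previous one multiplied by x in GF(2^128)
+# modulo x^128+x^7+x^2+x+1, the 0x87 "magic constant" composed on the
+# stack below and applied via the pshufd/pand/pxor sequences. A
+# standalone sketch on two 64-bit halves (illustrative only, never
+# called by the module; assumes a 64-bit perl):
+sub xts_mul_x_sketch
+{ my ($lo,$hi)=@_;		# 128-bit tweak, little-endian halves
+  my $carry = ($hi>>63)&1;	# bit 127 shifted out...
+  $hi = (($hi<<1)|(($lo>>63)&1)) & 0xffffffffffffffff;
+  $lo = (($lo<<1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
+  return ($lo,$hi);		# ...folds back in as 0x87
+}
+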
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout2);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesdec ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesdec ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
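
The pcmpgtd/pshufd/pand/pxor sequences above (the "isolate carry and
residue" steps) double the XTS tweak in GF(2^128) modulo the polynomial
x^128 + x^7 + x^2 + x + 1. A scalar C sketch of the same update, as a
reading aid only:

    #include <stdint.h>

    /* Double a 128-bit XTS tweak, stored little-endian per IEEE P1619:
     * shift left one bit; if a bit falls off the top, fold it back in
     * with the residue 0x87 -- the value the SSE carry mask selects. */
    static void xts_double_tweak(uint8_t T[16])
    {
        unsigned carry = 0;
        for (int i = 0; i < 16; i++) {
            unsigned next = T[i] >> 7;             /* outgoing bit of this byte */
            T[i] = (uint8_t)((T[i] << 1) | carry);
            carry = next;
        }
        if (carry)
            T[0] ^= 0x87;                          /* reduction term */
    }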
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
+ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
+ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+ &mov ($key_,&wparam(4));
+ &test ($len,$len);
+ &jz (&label("cbc_abort"));
+
+ &cmp (&wparam(5),0);
+ &xchg ($rounds_,"esp"); # alloca
+ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+ &mov ($key_,$key); # backup $key
+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
+ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
+
+ &movaps ($inout0,$ivec);
+ &cmp ($len,16);
+ &jb (&label("cbc_enc_tail"));
+ &sub ($len,16);
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
+ else
+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0,$out),$inout0); # store output
+ &lea ($out,&DWP(16,$out));
+ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+ &movaps ($ivec,$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+ &mov ("ecx",$len); # zaps $rounds
+ &data_word(0xA4F3F689); # rep movsb
+ &mov ("ecx",16); # zero tail
+ &sub ("ecx",$len);
+ &xor ("eax","eax"); # zaps $len
+ &data_word(0xAAF3F689); # rep stosb
+ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
+ &movups (&QWP(0,$out),$inout5);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+
+ &call ("_aesni_decrypt6");
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout4,$rndkey0);
+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
+ &xorps ($inout5,$rndkey1);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($inp,&DWP(0x60,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+	&mov	($rounds,$rounds_);	# restore $rounds
+ &movups (&QWP(0x30,$out),$inout3);
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0x40,$out),$inout4);
+ &lea ($out,&DWP(0x50,$out));
+ &sub ($len,0x60);
+ &ja (&label("cbc_dec_loop6"));
+
+ &movaps ($inout0,$inout5);
+ &movaps ($ivec,$rndkey0);
+ &add ($len,0x50);
+ &jle (&label("cbc_dec_tail_collected"));
+ &movups (&QWP(0,$out),$inout0);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &movaps ($in0,$inout0);
+ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
+ &movups ($inout1,&QWP(0x10,$inp));
+ &movaps ($in1,$inout1);
+ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
+ &movups ($inout3,&QWP(0x30,$inp));
+ &cmp ($len,0x40);
+ &jbe (&label("cbc_dec_four"));
+
+ &movups ($inout4,&QWP(0x40,$inp));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &movups ($inout0,&QWP(0,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($ivec,&QWP(0x40,$inp)); # IV
+ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &lea ($out,&DWP(0x40,$out));
+ &movaps ($inout0,$inout4);
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
+ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+ &lea ($out,&DWP(0x10,$out));
+ &movaps ($ivec,$in1);
+ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout2);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($out,&DWP(0x20,$out));
+ &movups ($ivec,&QWP(0x20,$inp));
+ &sub ($len,0x30);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &xorps ($inout0,$ivec);
+ &movups ($ivec,&QWP(0x30,$inp));
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &lea ($out,&DWP(0x30,$out));
+ &movaps ($inout0,$inout3);
+ &sub ($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+ &jnz (&label("cbc_dec_tail_partial"));
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
+ &mov ("ecx",16);
+ &mov ($inp,"esp");
+ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+
+&set_label("cbc_ret");
+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &movups (&QWP(0,$key_),$ivec); # output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is a private interface,
+# input:
+# "eax" const unsigned char *userKey
+# $rounds int bits
+# $key AES_KEY *key
+# output:
+# "eax" return code
+# $rounds rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+ &test ("eax","eax");
+ &jz (&label("bad_pointer"));
+ &test ($key,$key);
+ &jz (&label("bad_pointer"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+ &mov ($rounds,9);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
+ &call (&label("key_128_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x02);	# round 2
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
+ &call (&label("key_128"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(80,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_128",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &mov ($rounds,11);
+	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &mov ($rounds,13);
+ &lea ($key,&DWP(16,$key));
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("bad_pointer",4);
+ &mov ("eax",-1);
+ &ret ();
+&set_label("bad_keybits",4);
+ &mov ("eax",-2);
+ &ret ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &mov ($key,&wparam(2));
+	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
+ &test ("eax","eax");
+ &jnz (&label("dec_key_ret"));
+ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
+
+ &$movekey ("xmm0",&QWP(0,$key)); # just swap
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &$movekey (&QWP(0,"eax"),"xmm0");
+ &$movekey (&QWP(0,$key),"xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &aesimc ("xmm0","xmm0");
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
+ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+ &aesimc ("xmm0","xmm0");
+ &$movekey (&QWP(0,$key),"xmm0");
+
+ &xor ("eax","eax"); # return success
+&set_label("dec_key_ret");
+ &ret ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/main/openssl/crypto/aes/asm/aesni-x86_64.S b/main/openssl/crypto/aes/asm/aesni-x86_64.S
new file mode 100644
index 00000000..917c8323
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86_64.S
@@ -0,0 +1,2535 @@
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_encrypt,.-aesni_encrypt
+
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,220,249
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lenc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,222,249
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Ldec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+ jmp .Lenc_loop8_enter
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+.Lenc_loop8_enter:
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+ jmp .Ldec_loop8_enter
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+.Ldec_loop8_enter:
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+.size _aesni_decrypt8,.-_aesni_decrypt8
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $128,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $128,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $128,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $128,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+
+.Lecb_ret:
+ .byte 0xf3,0xc3
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+ movl 240(%rcx),%eax
+ movdqu (%r8),%xmm9
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ shrl $1,%eax
+ leaq 0(%rcx),%r11
+ movdqu (%r9),%xmm3
+ movdqa %xmm9,%xmm2
+ movl %eax,%r10d
+.byte 102,68,15,56,0,207
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ movups (%r11),%xmm0
+ movl %r10d,%eax
+ movups (%rdi),%xmm8
+
+ xorps %xmm0,%xmm2
+ movups 16(%r11),%xmm1
+ xorps %xmm8,%xmm0
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq %xmm6,%xmm9
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+
+ decq %rdx
+ leaq 16(%rdi),%rdi
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+ jnz .Lccm64_enc_outer
+
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+ movl 240(%rcx),%eax
+ movups (%r8),%xmm9
+ movdqu (%r9),%xmm3
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ movaps %xmm9,%xmm2
+ movl %eax,%r10d
+ movq %rcx,%r11
+.byte 102,68,15,56,0,207
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+ leaq 16(%rdi),%rdi
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movl %r10d,%eax
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+
+ subq $1,%rdx
+ jz .Lccm64_dec_break
+
+ movups (%r11),%xmm0
+ shrl $1,%eax
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm8,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_dec2_loop
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 16(%rdi),%rdi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+
+ movups (%r11),%xmm0
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%r11
+ xorps %xmm8,%xmm3
+.Loop_enc1_6:
+.byte 102,15,56,220,217
+ decl %eax
+ movups (%r11),%xmm1
+ leaq 16(%r11),%r11
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,217
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+ movdqu (%r8),%xmm14
+ movdqa .Lbswap_mask(%rip),%xmm15
+ xorl %eax,%eax
+.byte 102,69,15,58,22,242,3
+.byte 102,68,15,58,34,240,3
+
+ movl 240(%rcx),%eax
+ bswapl %r10d
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+.byte 102,69,15,58,34,226,0
+ leaq 3(%r10),%r11
+.byte 102,69,15,58,34,235,0
+ incl %r10d
+.byte 102,69,15,58,34,226,1
+ incq %r11
+.byte 102,69,15,58,34,235,1
+ incl %r10d
+.byte 102,69,15,58,34,226,2
+ incq %r11
+.byte 102,69,15,58,34,235,2
+ movdqa %xmm12,-40(%rsp)
+.byte 102,69,15,56,0,231
+ movdqa %xmm13,-24(%rsp)
+.byte 102,69,15,56,0,239
+
+ pshufd $192,%xmm12,%xmm2
+ pshufd $128,%xmm12,%xmm3
+ pshufd $64,%xmm12,%xmm4
+ cmpq $6,%rdx
+ jb .Lctr32_tail
+ shrl $1,%eax
+ movq %rcx,%r11
+ movl %eax,%r10d
+ subq $6,%rdx
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm2
+ movups (%r11),%xmm0
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm3
+ movups 16(%r11),%xmm1
+ pshufd $64,%xmm13,%xmm7
+ por %xmm14,%xmm4
+ por %xmm14,%xmm5
+ xorps %xmm0,%xmm2
+ por %xmm14,%xmm6
+ por %xmm14,%xmm7
+
+
+
+
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ movdqa .Lincrement32(%rip),%xmm13
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ movdqa -40(%rsp),%xmm12
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lctr32_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lctr32_enc_loop6
+
+.byte 102,15,56,220,209
+ paddd %xmm13,%xmm12
+.byte 102,15,56,220,217
+ paddd -24(%rsp),%xmm13
+.byte 102,15,56,220,225
+ movdqa %xmm12,-40(%rsp)
+.byte 102,15,56,220,233
+ movdqa %xmm13,-24(%rsp)
+.byte 102,15,56,220,241
+.byte 102,69,15,56,0,231
+.byte 102,15,56,220,249
+.byte 102,69,15,56,0,239
+
+.byte 102,15,56,221,208
+ movups (%rdi),%xmm8
+.byte 102,15,56,221,216
+ movups 16(%rdi),%xmm9
+.byte 102,15,56,221,224
+ movups 32(%rdi),%xmm10
+.byte 102,15,56,221,232
+ movups 48(%rdi),%xmm11
+.byte 102,15,56,221,240
+ movups 64(%rdi),%xmm1
+.byte 102,15,56,221,248
+ movups 80(%rdi),%xmm0
+ leaq 96(%rdi),%rdi
+
+ xorps %xmm2,%xmm8
+ pshufd $192,%xmm12,%xmm2
+ xorps %xmm3,%xmm9
+ pshufd $128,%xmm12,%xmm3
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ pshufd $64,%xmm12,%xmm4
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ xorps %xmm7,%xmm0
+ movups %xmm1,64(%rsi)
+ movups %xmm0,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movl %r10d,%eax
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+ movq %r11,%rcx
+ leal 1(%rax,%rax,1),%eax
+
+.Lctr32_tail:
+ por %xmm14,%xmm2
+ movups (%rdi),%xmm8
+ cmpq $2,%rdx
+ jb .Lctr32_one
+
+ por %xmm14,%xmm3
+ movups 16(%rdi),%xmm9
+ je .Lctr32_two
+
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm4
+ movups 32(%rdi),%xmm10
+ cmpq $4,%rdx
+ jb .Lctr32_three
+
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm5
+ movups 48(%rdi),%xmm11
+ je .Lctr32_four
+
+ por %xmm14,%xmm6
+ xorps %xmm7,%xmm7
+
+ call _aesni_encrypt6
+
+ movups 64(%rdi),%xmm1
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ movups %xmm1,64(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm8
+ movl 240(%rcx),%eax
+.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ xorps %xmm2,%xmm8
+ movups %xmm8,(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ movups %xmm9,16(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ movups %xmm10,32(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ movups %xmm11,48(%rsi)
+
+.Lctr32_done:
+ .byte 0xf3,0xc3
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_8:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+.byte 102,68,15,56,221,249
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,220,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,220,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_enc_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,221,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_enc_short:
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_encrypt6
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_9:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_9
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_encrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ andq $15,%r9
+ jz .Lxts_enc_ret
+ movq %r9,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%rdi),%eax
+ movzbl -16(%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,-16(%rsi)
+ movb %cl,0(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_enc_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups -16(%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_10:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_10
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,-16(%rsi)
+
+.Lxts_enc_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_11:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_11
+.byte 102,68,15,56,221,249
+ xorl %eax,%eax
+ testq $15,%rdx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%rdx
+
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,222,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,222,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_dec_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,223,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_dec_short:
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_decrypt6
+
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm14
+ movdqu %xmm5,48(%rsi)
+ pcmpgtd %xmm15,%xmm14
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ pshufd $19,%xmm14,%xmm11
+ andq $15,%r9
+ jz .Lxts_dec_ret
+
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm11
+ pxor %xmm15,%xmm11
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_12:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_12
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ movdqa %xmm12,%xmm11
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm13,%xmm11
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_decrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ andq $15,%r9
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ movq %r9,%rdx
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rdi),%xmm2
+ xorps %xmm11,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_13:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_13
+.byte 102,15,56,223,209
+ xorps %xmm11,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_steal:
+ movzbl 16(%rdi),%eax
+ movzbl (%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,(%rsi)
+ movb %cl,16(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_dec_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_14:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_14
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_dec_epilogue:
+	.byte	0xf3,0xc3	# repz ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
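+/* void aesni_cbc_encrypt(const void *in, void *out, size_t length,
+ *                        const AES_KEY *key, unsigned char *ivp, int enc);
+ * Hedged reading of the code below: %rdx=length, %rcx=key, %r8=ivp,
+ * %r9d=enc; unlike the ccm64/ctr32 helpers, the IV at *ivp IS written
+ * back on return. */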
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_15:
+.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_15
+.byte	102,15,56,221,209	# aesenclast %xmm1,%xmm2
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ movups %xmm2,(%r8)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long	0x9066A4F3	# rep movsb
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long	0x9066AAF3	# rep stosb
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ movups (%r8),%xmm9
+ movl %r10d,%eax
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_tail
+ shrl $1,%r10d
+ subq $112,%rdx
+ movl %r10d,%eax
+ movaps %xmm9,-24(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movups (%rcx),%xmm0
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 16(%rcx),%xmm1
+
+ leaq 32(%rcx),%rcx
+ movdqu 32(%rdi),%xmm4
+ xorps %xmm0,%xmm2
+ movdqu 48(%rdi),%xmm5
+ xorps %xmm0,%xmm3
+ movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+ movdqu 80(%rdi),%xmm7
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqu 96(%rdi),%xmm8
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqu 112(%rdi),%xmm9
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte	102,68,15,56,222,193	# aesdec %xmm1,%xmm8
+.byte	102,68,15,56,222,201	# aesdec %xmm1,%xmm9
+ movups 16(%rcx),%xmm1
+
+ call .Ldec_loop8_enter
+
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm1
+ xorps %xmm0,%xmm8
+ movups 112(%rdi),%xmm0
+ xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movq %r11,%rcx
+ movups %xmm7,80(%rsi)
+ leaq 128(%rdi),%rdi
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ movaps %xmm0,%xmm9
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm2,(%rsi)
+ leal 1(%r10,%r10,1),%eax
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ movaps %xmm2,%xmm8
+ cmpq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm3,%xmm7
+ cmpq $32,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm4,%xmm6
+ cmpq $48,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ cmpq $64,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_five
+
+ movups 80(%rdi),%xmm7
+ cmpq $96,%rdx
+ jbe .Lcbc_dec_six
+
+ movups 96(%rdi),%xmm8
+ movaps %xmm9,-24(%rsp)
+ call _aesni_decrypt8
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm9
+ xorps %xmm0,%xmm8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movaps %xmm8,%xmm2
+ subq $112,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ xorps %xmm9,%xmm2
+ movaps %xmm8,%xmm9
+ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ movaps %xmm7,%xmm9
+ movaps %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+ subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ movaps %xmm6,%xmm9
+ movaps %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+ subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps %xmm9,%xmm2
+ movups 48(%rdi),%xmm9
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ xorps %xmm6,%xmm5
+ movups %xmm4,32(%rsi)
+ movaps %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+ subq $64,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm9
+ xorps %xmm1,%xmm6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movaps %xmm6,%xmm2
+ subq $80,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm0
+ xorps %xmm1,%xmm6
+ movups 80(%rdi),%xmm9
+ xorps %xmm0,%xmm7
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movaps %xmm7,%xmm2
+ subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ andq $15,%rdx
+ movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,-24(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq -24(%rsp),%rsi
+.long	0x9066A4F3	# rep movsb
+
+.Lcbc_dec_ret:
+.Lcbc_ret:
+ .byte 0xf3,0xc3
+.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
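+/* int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ *                           AES_KEY *key);  hedged reading: expands the
+ * encrypt schedule via __aesni_set_encrypt_key, reverses its order and
+ * applies aesimc (.byte 102,15,56,219,xx) to the inner round keys. */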
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.byte	0x48,0x83,0xEC,0x08	# subq $8,%rsp
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte	102,15,56,219,192	# aesimc %xmm0,%xmm0
+.byte	102,15,56,219,201	# aesimc %xmm1,%xmm1
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte	102,15,56,219,192	# aesimc %xmm0,%xmm0
+ movups %xmm0,(%rdi)
+.Ldec_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_decrypt_key:
+.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
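+/* int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ *                           AES_KEY *key);  hedged reading: returns 0
+ * on success, -1 on NULL arguments and -2 on unsupported key bits
+ * (128/192/256 are accepted); key->rounds is stored at offset 240. */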
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
+.byte	0x48,0x83,0xEC,0x08	# subq $8,%rsp
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
+.L10rounds:
+ movl $9,%esi
+ movups %xmm0,(%rdx)
+.byte	102,15,58,223,200,1	# aeskeygenassist $1,%xmm0,%xmm1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movups %xmm0,(%rdx)
+.byte	102,15,58,223,202,1	# aeskeygenassist $1,%xmm2,%xmm1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>"
+.align 64
diff --git a/main/openssl/crypto/aes/asm/aesni-x86_64.pl b/main/openssl/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 00000000..0dbb194b
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,3069 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
+# [see below for details].
+#
+# Performance.
+#
+# Given the aes(enc|dec) instructions' latency, asymptotic performance
+# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
+# byte processed with a 128-bit key; given their throughput, asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# an asymptotic limit, it's not something you commonly achieve in
+# reality, but how close does one get? Below are results collected for
+# different modes and block sizes. Pairs of numbers are for en-/
+# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that the otherwise-used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. The
+# results were collected with a specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All of the above results are consistently
+# better. This module also provides better performance for block sizes
+# smaller than 128 bytes, at points *not* represented in the above
+# table.
+#
+# Looking at the results for the 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because the
+# implementation uses the "generic" CRYPTO_[c|o]fb128_encrypt
+# interfaces relying on single-block aesni_encrypt, which is not the
+# most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to the AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving the output. CCM CTR "stays invisible," because it's neatly
+# interleaved with CBC-MAC. This provides a ~30% improvement over a
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95% and "64-byte" is ~90% of the "8-KB"
+# one. The CTR curve doesn't follow this pattern and is the slowest-
+# changing one, with the "256-byte" result being 87% of the "8-KB"
+# result. This is because the overhead in CTR mode is the most
+# computationally intensive. Small-block CCM decrypt is slower than
+# encrypt, because the first CTR and last CBC-MAC iterations can't be
+# interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with the number of
+# rounds for larger block sizes, i.e. the 192-bit result being 10/12
+# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
+# the differences are a tad smaller, because the above-mentioned
+# penalty biases all results by the same constant value. In a similar
+# way function call overhead affects small-block performance, as well
+# as OFB and CFB results. The differences are not large; the most
+# common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0
+# and 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR,
+# OFB, CFB)...
+
+# January 2011
+#
+# While the Westmere processor features 6-cycle latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Hence this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with a
+# 128-bit key on Westmere, and 5.07 on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to the asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times the number of rounds, 10 for a 128-bit
+# key, divided by 16, the number of bytes in a block; in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In that case the asymptotic limit for such modes
+# can be obtained by dividing the above-mentioned numbers by the AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that the optimal interleave factor
+# is 3, and that's where the "magic" number of 1.25 comes from.
+# "Optimal interleave factor" means that an increase of the interleave
+# factor does not improve performance. The formula has proven to
+# reflect reality pretty well on Westmere... Sandy Bridge on the other
+# hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor 3x 6x 8x
+# theoretical asymptotic limit 1.67 0.83 0.625
+# measured performance for 8KB block 1.05 0.86 0.84
+#
+# "as if" interleave factor 4.7x 5.8x 6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt 1.16 0.93 0.93
+# CTR 1.14 0.91 n/a
+#
+# Well, given the 3x column it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens
+# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
+# execution magic is responsible for this. The processor overlaps not
+# only the additional instructions with the AES ones, but even AES
+# instructions processing adjacent triplets of independent blocks. In
+# the 6x case the additional instructions still claim a
+# disproportionately small number of additional cycles, but in the 8x
+# case the number of instructions must be a tad too high for the
+# out-of-order logic to cope with, and the AES unit remains
+# underutilized... As you can see, an 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit
+# aesni-x86.pl utilizes a 6x interleave because of its limited
+# register bank capacity.
+#
+# Higher interleave factors do have a negative impact on Westmere
+# performance. While for ECB mode it's a negligible ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by a
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor. The arithmetic behind the quoted limits is sketched below.
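+#
+# A minimal C sketch (hypothetical, merely reproducing the arithmetic
+# of this commentary: latency * rounds / (16 bytes * interleave)):
+#
+#	#include <stdio.h>
+#	int main(void)
+#	{
+#	    double westmere = 6, sandy = 8, rounds = 10; /* 128-bit key */
+#	    printf("%.2f\n", westmere*rounds/16);     /* 3.75 cpb, serial CBC */
+#	    printf("%.2f\n", westmere*rounds/(16*3)); /* 1.25 cpb, 3x ECB/CTR */
+#	    printf("%.3f\n", sandy*rounds/(16*8));    /* 0.625 cpb, 8x limit  */
+#	    return 0;
+#	}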
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.97. Just as
+# in CTR mode, the AES instruction interleave factor was chosen to be
+# 6x.
+
+$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates a drop-in replacement for
+			# crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";	# both arms are movups at present
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is the natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why a folded loop? Because aes[enc|dec] is slow enough to accommodate
+# the cycles which take care of the loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
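+
+# A hedged usage sketch for the two entry points above; the prototype
+# is taken from the commentary, and the assumption that
+# aesni_set_encrypt_key mirrors AES_set_encrypt_key's signature is
+# exactly that, an assumption (callers normally reach these through
+# EVP/the AES-NI engine rather than directly):
+#
+#	#include <openssl/aes.h>
+#
+#	int  aesni_set_encrypt_key(const unsigned char *userKey,
+#	                           int bits, AES_KEY *key);
+#	void aesni_encrypt(const void *in, void *out, const AES_KEY *key);
+#
+#	void one_block(const unsigned char k[16],
+#	               const unsigned char in[16], unsigned char out[16])
+#	{
+#	    AES_KEY ks;
+#	    if (aesni_set_encrypt_key(k, 128, &ks) == 0)
+#	        aesni_encrypt(in, out, &ks);	/* one 16-byte block */
+#	}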
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus a 3x interleave was the one providing optimal
+# utilization, i.e. when the subroutine's throughput is virtually the
+# same as that of the non-interleaved subroutine [for numbers of input
+# blocks up to 3]. This is why it makes no sense to implement a 2x
+# subroutine. aes[enc|dec] latency in the next processor generation is
+# 8, but the instructions can be scheduled every cycle. The optimal
+# interleave for the new processor is therefore 8x...
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# A 4x interleave is implemented to improve small-block performance,
+# most notably [and naturally] 4-block performance, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout5
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+.L${dir}_loop6_enter: # happens to be 16-byte aligned
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+ jmp .L${dir}_loop8_enter
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+.L${dir}_loop8_enter: # happens to be 16-byte aligned
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
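+#
+# A hedged call sketch (the length is truncated to a multiple of 16 by
+# the "and $-16" below, and enc selects the direction; ks is assumed
+# to be an AES_KEY expanded for that direction):
+#
+#	unsigned char in[4096], out[4096];	/* hypothetical buffers */
+#	aesni_ecb_encrypt(in, out, sizeof(in), &ks, 1);	/* 1 = encrypt */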
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+ and \$-16,$len
+ jz .Lecb_ret
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len
+ jb .Lecb_enc_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ call _aesni_encrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len
+ jb .Lecb_dec_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ call _aesni_decrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+
+.Lecb_ret:
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details)
+#
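+# A hedged call sketch; per the caveats above, the caller maintains
+# the counter block and finalizes the CMAC itself (ks, in, out and
+# nblocks are hypothetical):
+#
+#	char ivec[16], cmac[16];	/* counter block and running MAC */
+#	aesni_ccm64_encrypt_blocks(in, out, nblocks, &ks, ivec, cmac);
+#	/* on return ivec is NOT advanced and cmac is NOT finalized */
+#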
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shr \$1,$rounds
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ mov $rounds,$rnds_
+ pshufb $bswap_mask,$iv
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov $rnds_,$rounds
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey ($key),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ dec $len
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+ jnz .Lccm64_enc_outer
+
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ mov $rnds_,$rounds
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len
+ jz .Lccm64_dec_break
+
+ $movkey ($key_),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey ($key),$rndkey0
+
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 16($inp),$inp
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
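+# A hedged call sketch; whole blocks only, and the caller advances the
+# big-endian 32-bit counter kept in the last four bytes of its own
+# ivec copy between calls (ks, in, out and nblocks are hypothetical):
+#
+#	char ivec[16];	/* IV with the counter in bytes 12..15 */
+#	aesni_ctr32_encrypt_blocks(in, out, nblocks, &ks, ivec);
+#	/* caller: add nblocks to the counter in ivec afterwards */
+#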
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0xc8(%rsp),%rsp
+ movaps %xmm6,0x20(%rsp)
+ movaps %xmm7,0x30(%rsp)
+ movaps %xmm8,0x40(%rsp)
+ movaps %xmm9,0x50(%rsp)
+ movaps %xmm10,0x60(%rsp)
+ movaps %xmm11,0x70(%rsp)
+ movaps %xmm12,0x80(%rsp)
+ movaps %xmm13,0x90(%rsp)
+ movaps %xmm14,0xa0(%rsp)
+ movaps %xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+ cmp \$1,$len
+ je .Lctr32_one_shortcut
+
+ movdqu ($ivp),$ivec
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+ xor $rounds,$rounds
+ pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
+ pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+
+ mov 240($key),$rounds # key->rounds
+ bswap $rnds_
+ pxor $iv0,$iv0 # vector of 3 32-bit counters
+ pxor $iv1,$iv1 # vector of 3 32-bit counters
+ pinsrd \$0,$rnds_,$iv0
+ lea 3($rnds_),$key_
+ pinsrd \$0,$key_,$iv1
+ inc $rnds_
+ pinsrd \$1,$rnds_,$iv0
+ inc $key_
+ pinsrd \$1,$key_,$iv1
+ inc $rnds_
+ pinsrd \$2,$rnds_,$iv0
+ inc $key_
+ pinsrd \$2,$key_,$iv1
+ movdqa $iv0,$reserved(%rsp)
+ pshufb $bswap_mask,$iv0
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ pshufb $bswap_mask,$iv1
+
+ pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
+ pshufd \$`2<<6`,$iv0,$inout1
+ pshufd \$`1<<6`,$iv0,$inout2
+ cmp \$6,$len
+ jb .Lctr32_tail
+ shr \$1,$rounds
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ sub \$6,$len
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout0 # merge counter-less ivec
+ $movkey ($key_),$rndkey0
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout1
+ $movkey 16($key_),$rndkey1
+ pshufd \$`1<<6`,$iv1,$inout5
+ por $ivec,$inout2
+ por $ivec,$inout3
+ xorps $rndkey0,$inout0
+ por $ivec,$inout4
+ por $ivec,$inout5
+
+	# inline _aesni_encrypt6 and interleave the last rounds
+	# with our own code...
+
+ pxor $rndkey0,$inout1
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ aesenc $rndkey1,$inout1
+ movdqa .Lincrement32(%rip),$iv1
+ pxor $rndkey0,$inout3
+ aesenc $rndkey1,$inout2
+ movdqa $reserved(%rsp),$iv0
+ pxor $rndkey0,$inout4
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lctr32_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ paddd $iv1,$iv0 # increment counter vector
+ aesenc $rndkey1,$inout1
+ paddd `$reserved+0x10`(%rsp),$iv1
+ aesenc $rndkey1,$inout2
+ movdqa $iv0,$reserved(%rsp) # save counter vector
+ aesenc $rndkey1,$inout3
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ aesenc $rndkey1,$inout4
+ pshufb $bswap_mask,$iv0 # byte swap
+ aesenc $rndkey1,$inout5
+ pshufb $bswap_mask,$iv1
+
+ aesenclast $rndkey0,$inout0
+ movups ($inp),$in0 # load input
+ aesenclast $rndkey0,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey0,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey0,$inout3
+ movups 0x30($inp),$in3
+ aesenclast $rndkey0,$inout4
+ movups 0x40($inp),$rndkey1
+ aesenclast $rndkey0,$inout5
+ movups 0x50($inp),$rndkey0
+ lea 0x60($inp),$inp
+
+ xorps $inout0,$in0 # xor
+ pshufd \$`3<<6`,$iv0,$inout0
+ xorps $inout1,$in1
+ pshufd \$`2<<6`,$iv0,$inout1
+ movups $in0,($out) # store output
+ xorps $inout2,$in2
+ pshufd \$`1<<6`,$iv0,$inout2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ xorps $inout5,$rndkey0
+ movups $rndkey1,0x40($out)
+ movups $rndkey0,0x50($out)
+ lea 0x60($out),$out
+ mov $rnds_,$rounds
+ sub \$6,$len
+ jnc .Lctr32_loop6
+
+ add \$6,$len
+ jz .Lctr32_done
+ mov $key_,$key # restore $key
+ lea 1($rounds,$rounds),$rounds # restore original value
+
+.Lctr32_tail:
+ por $ivec,$inout0
+ movups ($inp),$in0
+ cmp \$2,$len
+ jb .Lctr32_one
+
+ por $ivec,$inout1
+ movups 0x10($inp),$in1
+ je .Lctr32_two
+
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout2
+ movups 0x20($inp),$in2
+ cmp \$4,$len
+ jb .Lctr32_three
+
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout3
+ movups 0x30($inp),$in3
+ je .Lctr32_four
+
+ por $ivec,$inout4
+ xorps $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ movups 0x40($inp),$rndkey1
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ movups $rndkey1,0x40($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups ($ivp),$inout0
+ movups ($inp),$in0
+ mov 240($key),$rounds # key->rounds
+.Lctr32_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps $inout0,$in0
+ movups $in0,($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ movups $in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+ movaps 0x20(%rsp),%xmm6
+ movaps 0x30(%rsp),%xmm7
+ movaps 0x40(%rsp),%xmm8
+ movaps 0x50(%rsp),%xmm9
+ movaps 0x60(%rsp),%xmm10
+ movaps 0x70(%rsp),%xmm11
+ movaps 0x80(%rsp),%xmm12
+ movaps 0x90(%rsp),%xmm13
+ movaps 0xa0(%rsp),%xmm14
+ movaps 0xb0(%rsp),%xmm15
+ lea 0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#			const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
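+# A hedged call sketch; key1 processes the data, key2 encrypts the
+# tweak (so key2 is an *encrypt* schedule even for aesni_xts_decrypt),
+# and iv carries the clear-text tweak, e.g. the sector number:
+#
+#	AES_KEY k1, k2;	/* expanded beforehand */
+#	aesni_xts_encrypt(in, out, len, &k1, &k2, iv);
+#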
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
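+# The unrolled sequence above multiplies the tweak by x in GF(2^128):
+# shift left one bit and, when the top bit carries out, fold it back
+# in with the 0x87 polynomial (.Lxts_magic). A hedged scalar C
+# equivalent of one doubling step:
+#
+#	void xts_double(unsigned char t[16])	/* little-endian tweak */
+#	{
+#	    unsigned carry = 0, msb;
+#	    for (int i = 0; i < 16; i++) {
+#	        msb = t[i] >> 7;
+#	        t[i] = (unsigned char)(t[i] << 1 | carry);
+#	        carry = msb;
+#	    }
+#	    if (carry)
+#	        t[0] ^= 0x87;	/* reduction */
+#	}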
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_enc_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+	# inline _aesni_encrypt6 and interleave the first and last rounds
+	# with our own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesenc $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenclast $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenclast $rndkey0,$inout3
+ aesenclast $rndkey0,$inout4
+ aesenclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_enc_short:
+ add \$16*6,$len
+ jz .Lxts_enc_done
+
+ cmp \$0x20,$len
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmp \$0x40,$len
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ lea 16*1($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
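+	# The loop above is XTS ciphertext stealing. A hedged C sketch of
+	# the byte shuffle (illustration only, not part of this file; tail
+	# is the final len%16 bytes):
+	#
+	#	for (i = 0; i < tail; i++) {
+	#		unsigned char c = out[i-16];	/* stolen ciphertext   */
+	#		out[i-16] = in[i];		/* plaintext tail in   */
+	#		out[i] = c;			/* ciphertext tail out */
+	#	}
+	#	/* ... then re-encrypt the mixed block at out-16 (below) */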
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
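+# The four unrolled iterations above are the standard XTS tweak update:
+# multiply by x in GF(2^128) with the x^128 = x^7+x^2+x+1 reduction,
+# which is where the 0x87 in .Lxts_magic comes from. A hedged C
+# equivalent (illustration only, little-endian tweak):
+#
+#	void xts_double(unsigned char t[16])
+#	{
+#		unsigned carry = t[15] >> 7;
+#		for (int i = 15; i > 0; i--)
+#			t[i] = (unsigned char)((t[i] << 1) | (t[i-1] >> 7));
+#		t[0] = (unsigned char)((t[0] << 1) ^ (carry ? 0x87 : 0));
+#	}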
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_dec_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_decrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesdec $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesdec $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ dec $rounds
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ lea 32($key),$key
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdeclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdeclast $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdeclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdeclast $rndkey0,$inout3
+ aesdeclast $rndkey0,$inout4
+ aesdeclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_dec_short:
+ add \$16*6,$len
+ jz .Lxts_dec_done
+
+ cmp \$0x20,$len
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmp \$0x40,$len
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movups ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movups 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
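+	# Decrypt-side stealing mirrors the encrypt path with the tweak
+	# order swapped: the block above was decrypted with the next tweak
+	# (tweak[1]) to recover the plaintext tail, and the reassembled
+	# block is decrypted below with the preceding tweak (tweak[0]).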
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+ ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
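+#
+# A minimal usage sketch (hedged: assumes $PREFIX expands to "aesni";
+# all other names are the caller's own):
+#
+#	AES_KEY ks;
+#	unsigned char iv[16];				/* updated in place */
+#	aesni_set_encrypt_key(user_key, 128, &ks);
+#	aesni_cbc_encrypt(in, out, len, &ks, iv, 1);	/* enc!=0 => encrypt */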
+{
+my $reserved = $win64?0x40:-0x18; # used in decrypt
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ movups $inout0,($ivp)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_tail
+ shr \$1,$rnds_
+ sub \$0x70,$len
+ mov $rnds_,$rounds
+ movaps $iv,$reserved(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps $rndkey0,$reserved(%rsp) # save IV
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ $movkey ($key),$rndkey0
+ movups ($inp),$inout0 # load input
+ movups 0x10($inp),$inout1
+ $movkey 16($key),$rndkey1
+
+ lea 32($key),$key
+ movdqu 0x20($inp),$inout2
+ xorps $rndkey0,$inout0
+ movdqu 0x30($inp),$inout3
+ xorps $rndkey0,$inout1
+ movdqu 0x40($inp),$inout4
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ movdqu 0x50($inp),$inout5
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqu 0x60($inp),$inout6
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqu 0x70($inp),$inout7
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+
+ call .Ldec_loop8_enter
+
+ movups ($inp),$rndkey1 # re-load input
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$rndkey1
+ xorps $rndkey0,$inout6
+ movups 0x70($inp),$rndkey0 # IV
+ xorps $rndkey1,$inout7
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,0x40($out)
+ mov $key_,$key # restore $key
+ movups $inout5,0x50($out)
+ lea 0x80($inp),$inp
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ movaps $rndkey0,$iv
+ add \$0x70,$len
+ jle .Lcbc_dec_tail_collected
+ movups $inout0,($out)
+ lea 1($rnds_,$rnds_),$rounds
+ lea 0x10($out),$out
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ movaps $inout0,$in0
+ cmp \$0x10,$len
+ jbe .Lcbc_dec_one
+
+ movups 0x10($inp),$inout1
+ movaps $inout1,$in1
+ cmp \$0x20,$len
+ jbe .Lcbc_dec_two
+
+ movups 0x20($inp),$inout2
+ movaps $inout2,$in2
+ cmp \$0x30,$len
+ jbe .Lcbc_dec_three
+
+ movups 0x30($inp),$inout3
+ cmp \$0x40,$len
+ jbe .Lcbc_dec_four
+
+ movups 0x40($inp),$inout4
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_five
+
+ movups 0x50($inp),$inout5
+ cmp \$0x60,$len
+ jbe .Lcbc_dec_six
+
+ movups 0x60($inp),$inout6
+ movaps $iv,$reserved(%rsp) # save IV
+ call _aesni_decrypt8
+ movups ($inp),$rndkey1
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$iv # IV
+ xorps $rndkey0,$inout6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ lea 0x60($out),$out
+ movaps $inout6,$inout0
+ sub \$0x70,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ movaps $in1,$iv
+ movaps $inout1,$inout0
+ lea 0x10($out),$out
+ sub \$0x20,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ movaps $in2,$iv
+ movaps $inout2,$inout0
+ lea 0x20($out),$out
+ sub \$0x30,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps $iv,$inout0
+ movups 0x30($inp),$iv
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ xorps $in2,$inout3
+ movups $inout2,0x20($out)
+ movaps $inout3,$inout0
+ lea 0x30($out),$out
+ sub \$0x40,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$iv
+ xorps $rndkey1,$inout4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ lea 0x40($out),$out
+ movaps $inout4,$inout0
+ sub \$0x50,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$rndkey0
+ xorps $rndkey1,$inout4
+ movups 0x50($inp),$iv
+ xorps $rndkey0,$inout5
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ lea 0x50($out),$out
+ movaps $inout5,$inout0
+ sub \$0x60,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ and \$15,$len
+ movups $iv,($ivp)
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,$reserved(%rsp)
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea $reserved(%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+
+.Lcbc_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int bits, AES_KEY *key)
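+#
+# Hedged usage sketch (return values as implemented below: 0 on
+# success, -1 for NULL arguments, -2 for unsupported key bits):
+#
+#	AES_KEY enc, dec;
+#	if (aesni_set_encrypt_key(user_key, 256, &enc) != 0)
+#		/* handle error */;
+#	aesni_set_decrypt_key(user_key, 256, &dec);	/* aesimc-inverted */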
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+	shl	\$4,$bits			# rounds-1 after __aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ $movkey %xmm0,($inp)
+.Ldec_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on submission by
+#
+# Huang Ying <ying.huang@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical
+# path, and kept entirely within %xmm0-5 so that no Win64 non-volatile
+# XMM registers need to be saved and restored.
+#
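+# For reference, each round that .Lkey_expansion_128 computes with the
+# shufps/xorps sequence below matches the FIPS-197 recurrence (hedged
+# sketch; rcon is the aeskeygenassist immediate):
+#
+#	w[4i+4] = w[4i]   ^ SubWord(RotWord(w[4i+3])) ^ rcon;
+#	w[4i+5] = w[4i+1] ^ w[4i+4];
+#	w[4i+6] = w[4i+2] ^ w[4i+5];
+#	w[4i+7] = w[4i+3] ^ w[4i+6];
+#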
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+	movups	16($inp),%xmm2			# remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
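+#
+# The ccm64/xts handlers below locate their %xmm save area through a
+# pair of RVAs stored as HandlerData[] in the .xdata records at the end
+# of this file; a hedged sketch of that layout:
+#
+#	struct handler_data { UINT32 prologue_rva, epilogue_rva; };
+#	/* Rip <  ImageBase+prologue_rva  => nothing saved yet       */
+#	/* Rip >= ImageBase+epilogue_rva  => already restored        */
+#	/* otherwise copy the saved %xmm regs back into CONTEXT.Xmm6 */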
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_se_handler,\@abi-omnipotent
+.align 16
+ecb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+
+ jmp .Lcommon_seh_tail
+.size ecb_se_handler,.-ecb_se_handler
+
+.type ccm64_se_handler,\@abi-omnipotent
+.align 16
+ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ccm64_se_handler,.-ccm64_se_handler
+
+.type ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lctr32_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lctr32_ret(%rip),%r10
+ cmp %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ lea 0x20(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xc8(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ctr32_se_handler,.-ctr32_se_handler
+
+.type xts_se_handler,\@abi-omnipotent
+.align 16
+xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+	lea	(%rsi,%r10),%r10		# prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0x60(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x68+160(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # top of stack
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_se_handler
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr32_se_handler
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
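+# A worked example (hand-checked, illustration only): for
+# "aesenc %xmm9,%xmm10" the destination index is 10 and the source
+# index is 9, so rex() pushes 0x40|0x04|0x01 = 0x45 (REX.R|REX.B);
+# registers 0-7 get no prefix byte at all.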
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
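+# Two hedged, hand-checked examples of the rewrite performed by the
+# s/// below (emitting raw opcode bytes so that even assemblers without
+# AES-NI support can build this file):
+#
+#	aesenc	%xmm9,%xmm10
+#	  => .byte	102,69,15,56,220,209	# 66 REX.RB 0f 38 dc /r
+#	aeskeygenassist	$0x1,%xmm0,%xmm1
+#	  => .byte	102,15,58,223,200,1	# 66 0f 3a df /r ib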
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/bsaes-x86_64.S b/main/openssl/crypto/aes/asm/bsaes-x86_64.S
new file mode 100644
index 00000000..dc92d4da
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-x86_64.S
@@ -0,0 +1,2498 @@
+.text
+
+
+
+
+.type _bsaes_encrypt8,@function
+.align 64
+_bsaes_encrypt8:
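+# Bit-sliced AES (illustrative note): the eight input blocks in
+# %xmm15,%xmm0-%xmm6 are transposed into bit-planes by the psrlq/pand/
+# psllq swap network below, so the S-box can be evaluated as pure
+# boolean algebra on all eight blocks at once.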
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa 80(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Lenc_sbox:
+ pxor %xmm5,%xmm4
+ pxor %xmm0,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm1,%xmm5
+ pxor %xmm15,%xmm4
+
+ pxor %xmm2,%xmm5
+ pxor %xmm6,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm3,%xmm2
+ pxor %xmm4,%xmm3
+ pxor %xmm0,%xmm2
+
+ pxor %xmm6,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm10
+ movdqa %xmm0,%xmm9
+ movdqa %xmm4,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm11
+
+ pxor %xmm3,%xmm10
+ pxor %xmm1,%xmm9
+ pxor %xmm2,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm3,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm15,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm2,%xmm11
+ pxor %xmm15,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm12
+ movdqa %xmm4,%xmm11
+ pxor %xmm0,%xmm12
+ pxor %xmm5,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm1,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm3,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm0,%xmm13
+ pand %xmm2,%xmm11
+ movdqa %xmm6,%xmm14
+ pand %xmm15,%xmm12
+ pand %xmm4,%xmm13
+ por %xmm5,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm5,%xmm11
+ movdqa %xmm4,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm5,%xmm9
+ pxor %xmm4,%xmm5
+ pand %xmm14,%xmm4
+ pand %xmm13,%xmm5
+ pxor %xmm4,%xmm5
+ pxor %xmm9,%xmm4
+ pxor %xmm15,%xmm11
+ pxor %xmm2,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm2,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm2
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm2,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm11,%xmm5
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm2
+
+ movdqa %xmm6,%xmm11
+ movdqa %xmm0,%xmm7
+ pxor %xmm3,%xmm11
+ pxor %xmm1,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm3,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm1,%xmm3
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm1
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm3
+ pxor %xmm11,%xmm7
+ pxor %xmm1,%xmm3
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm1
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm6,%xmm10
+ pxor %xmm0,%xmm6
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm6
+ pxor %xmm0,%xmm6
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm6
+ pxor %xmm11,%xmm3
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm1
+ pxor %xmm15,%xmm6
+ pxor %xmm5,%xmm0
+ pxor %xmm6,%xmm3
+ pxor %xmm15,%xmm5
+ pxor %xmm0,%xmm15
+
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ pxor %xmm2,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm4,%xmm3
+
+ pxor %xmm2,%xmm5
+ decl %r10d
+ jl .Lenc_done
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm3,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm2,%xmm11
+ pxor %xmm10,%xmm5
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm1,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm1
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm2,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm7
+ pxor %xmm1,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm3,%xmm10
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm1,%xmm5
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm12,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm14,%xmm6
+ pxor %xmm13,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm1
+ movdqa %xmm8,%xmm4
+ movdqa 48(%r11),%xmm7
+ jnz .Lenc_loop
+ movdqa 64(%r11),%xmm7
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm2,%xmm10
+ psrlq $1,%xmm2
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm2
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm2
+ pxor %xmm1,%xmm4
+ psllq $1,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $1,%xmm2
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm2
+ movdqa %xmm3,%xmm9
+ psrlq $1,%xmm3
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm5,%xmm3
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm15
+ pxor %xmm3,%xmm5
+ psllq $1,%xmm3
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm2,%xmm10
+ psrlq $2,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm1,%xmm2
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm2
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm2,%xmm1
+ psllq $2,%xmm2
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm2
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm5,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm5
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm5,%xmm9
+ psrlq $4,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $4,%xmm3
+ pxor %xmm4,%xmm5
+ pxor %xmm1,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm4
+ psllq $4,%xmm5
+ pxor %xmm3,%xmm1
+ psllq $4,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm2,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm2
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,@function
+.align 64
+_bsaes_decrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa -48(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Ldec_sbox:
+ pxor %xmm3,%xmm2
+
+ pxor %xmm6,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm3,%xmm5
+ pxor %xmm5,%xmm6
+ pxor %xmm6,%xmm0
+
+ pxor %xmm0,%xmm15
+ pxor %xmm4,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm15,%xmm4
+ pxor %xmm2,%xmm0
+ movdqa %xmm2,%xmm10
+ movdqa %xmm6,%xmm9
+ movdqa %xmm0,%xmm8
+ movdqa %xmm3,%xmm12
+ movdqa %xmm4,%xmm11
+
+ pxor %xmm15,%xmm10
+ pxor %xmm3,%xmm9
+ pxor %xmm5,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm15,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm1,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm5,%xmm11
+ pxor %xmm1,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm2,%xmm12
+ movdqa %xmm0,%xmm11
+ pxor %xmm6,%xmm12
+ pxor %xmm4,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm3,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm15,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm6,%xmm13
+ pand %xmm5,%xmm11
+ movdqa %xmm2,%xmm14
+ pand %xmm1,%xmm12
+ pand %xmm0,%xmm13
+ por %xmm4,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm4,%xmm11
+ movdqa %xmm0,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm4,%xmm9
+ pxor %xmm0,%xmm4
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm4
+ pxor %xmm0,%xmm4
+ pxor %xmm9,%xmm0
+ pxor %xmm1,%xmm11
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm1,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm5,%xmm1
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm5
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm1
+ pxor %xmm11,%xmm7
+ pxor %xmm5,%xmm1
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm5
+ pxor %xmm11,%xmm4
+ pxor %xmm11,%xmm1
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm5
+
+ movdqa %xmm2,%xmm11
+ movdqa %xmm6,%xmm7
+ pxor %xmm15,%xmm11
+ pxor %xmm3,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm3,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm3
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm3,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm3
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm2,%xmm10
+ pxor %xmm6,%xmm2
+ pand %xmm14,%xmm6
+ pand %xmm13,%xmm2
+ pxor %xmm6,%xmm2
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm4,%xmm5
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm6,%xmm4
+ pxor %xmm1,%xmm3
+ pxor %xmm15,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm5,%xmm2
+ pxor %xmm0,%xmm5
+ pxor %xmm3,%xmm2
+
+ pxor %xmm15,%xmm3
+ pxor %xmm2,%xmm6
+ decl %r10d
+ jl .Ldec_done
+
+ pshufd $78,%xmm15,%xmm7
+ pshufd $78,%xmm2,%xmm13
+ pxor %xmm15,%xmm7
+ pshufd $78,%xmm4,%xmm14
+ pxor %xmm2,%xmm13
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm4,%xmm14
+ pshufd $78,%xmm5,%xmm9
+ pxor %xmm0,%xmm8
+ pshufd $78,%xmm3,%xmm10
+ pxor %xmm5,%xmm9
+ pxor %xmm13,%xmm15
+ pxor %xmm13,%xmm0
+ pshufd $78,%xmm1,%xmm11
+ pxor %xmm3,%xmm10
+ pxor %xmm7,%xmm5
+ pxor %xmm8,%xmm3
+ pshufd $78,%xmm6,%xmm12
+ pxor %xmm1,%xmm11
+ pxor %xmm14,%xmm0
+ pxor %xmm9,%xmm1
+ pxor %xmm6,%xmm12
+
+ pxor %xmm14,%xmm5
+ pxor %xmm13,%xmm3
+ pxor %xmm13,%xmm1
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm14,%xmm1
+ pxor %xmm14,%xmm6
+ pxor %xmm12,%xmm4
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm5,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm3,%xmm10
+ pxor %xmm9,%xmm5
+ pshufd $147,%xmm1,%xmm11
+ pxor %xmm10,%xmm3
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm1
+ pshufd $147,%xmm2,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm2
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm1,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm3,%xmm11
+ pshufd $78,%xmm1,%xmm7
+ pxor %xmm2,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm5,%xmm10
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm2,%xmm3
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm12,%xmm8
+ pxor %xmm1,%xmm10
+ pxor %xmm14,%xmm6
+ pxor %xmm3,%xmm13
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm2
+ movdqa %xmm13,%xmm5
+ movdqa %xmm8,%xmm4
+ movdqa %xmm2,%xmm1
+ movdqa %xmm10,%xmm2
+ movdqa -16(%r11),%xmm7
+ jnz .Ldec_loop
+ movdqa -32(%r11),%xmm7
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm2,%xmm9
+ psrlq $1,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $1,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm6,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm4
+ psllq $1,%xmm2
+ pxor %xmm1,%xmm6
+ psllq $1,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm3,%xmm5
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm15
+ pxor %xmm5,%xmm3
+ psllq $1,%xmm5
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm1,%xmm10
+ psrlq $2,%xmm1
+ pxor %xmm4,%xmm6
+ pxor %xmm2,%xmm1
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm1
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm1,%xmm2
+ psllq $2,%xmm1
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm3
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm5
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm3,%xmm9
+ psrlq $4,%xmm3
+ movdqa %xmm5,%xmm10
+ psrlq $4,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm5
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $4,%xmm3
+ pxor %xmm5,%xmm2
+ psllq $4,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm5
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+.type _bsaes_key_convert,@function
+.align 16
+_bsaes_key_convert:
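+# Illustrative note: converts the standard AES key schedule at %rcx into
+# the bit-sliced form consumed by the routines above, writing the
+# expanded schedule out at %rax (%r10d counts rounds).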
+ leaq .Lmasks(%rip),%r11
+ movdqu (%rcx),%xmm7
+ leaq 16(%rcx),%rcx
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ movdqa 48(%r11),%xmm3
+ movdqa 64(%r11),%xmm4
+ pcmpeqd %xmm5,%xmm5
+
+ movdqu (%rcx),%xmm6
+ movdqa %xmm7,(%rax)
+ leaq 16(%rax),%rax
+ decl %r10d
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+.byte 102,15,56,0,244
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+
+ pand %xmm6,%xmm8
+ pand %xmm6,%xmm9
+ movdqa %xmm2,%xmm10
+ pcmpeqb %xmm0,%xmm8
+ psllq $4,%xmm0
+ movdqa %xmm3,%xmm11
+ pcmpeqb %xmm1,%xmm9
+ psllq $4,%xmm1
+
+ pand %xmm6,%xmm10
+ pand %xmm6,%xmm11
+ movdqa %xmm0,%xmm12
+ pcmpeqb %xmm2,%xmm10
+ psllq $4,%xmm2
+ movdqa %xmm1,%xmm13
+ pcmpeqb %xmm3,%xmm11
+ psllq $4,%xmm3
+
+ movdqa %xmm2,%xmm14
+ movdqa %xmm3,%xmm15
+ pxor %xmm5,%xmm8
+ pxor %xmm5,%xmm9
+
+ pand %xmm6,%xmm12
+ pand %xmm6,%xmm13
+ movdqa %xmm8,0(%rax)
+ pcmpeqb %xmm0,%xmm12
+ psrlq $4,%xmm0
+ movdqa %xmm9,16(%rax)
+ pcmpeqb %xmm1,%xmm13
+ psrlq $4,%xmm1
+ leaq 16(%rcx),%rcx
+
+ pand %xmm6,%xmm14
+ pand %xmm6,%xmm15
+ movdqa %xmm10,32(%rax)
+ pcmpeqb %xmm2,%xmm14
+ psrlq $4,%xmm2
+ movdqa %xmm11,48(%rax)
+ pcmpeqb %xmm3,%xmm15
+ psrlq $4,%xmm3
+ movdqu (%rcx),%xmm6
+
+ pxor %xmm5,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm12,64(%rax)
+ movdqa %xmm13,80(%rax)
+ movdqa %xmm14,96(%rax)
+ movdqa %xmm15,112(%rax)
+ leaq 128(%rax),%rax
+ decl %r10d
+ jnz .Lkey_loop
+
+ movdqa 80(%r11),%xmm7
+
+ .byte 0xf3,0xc3
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,@function
+.align 16
+bsaes_cbc_encrypt:
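+# Note: only CBC *decryption* of at least eight blocks is bit-sliced
+# here; encryption and short inputs fall through to asm_AES_cbc_encrypt.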
+ cmpl $0,%r9d
+ jne asm_AES_cbc_encrypt
+ cmpq $128,%rdx
+ jb asm_AES_cbc_encrypt
+
+ movq %rsp,%rax
+.Lcbc_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movq %r8,%rbx
+ shrq $4,%r14
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx),%xmm14
+ subq $8,%r14
+.Lcbc_dec_loop:
+ movdqu 0(%r12),%xmm15
+ movdqu 16(%r12),%xmm0
+ movdqu 32(%r12),%xmm1
+ movdqu 48(%r12),%xmm2
+ movdqu 64(%r12),%xmm3
+ movdqu 80(%r12),%xmm4
+ movq %rsp,%rax
+ movdqu 96(%r12),%xmm5
+ movl %edx,%r10d
+ movdqu 112(%r12),%xmm6
+ movdqa %xmm14,32(%rbp)
+
+ call _bsaes_decrypt8
+
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm2
+ movdqu 112(%r12),%xmm14
+ pxor %xmm13,%xmm4
+ movdqu %xmm15,0(%r13)
+ leaq 128(%r12),%r12
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ subq $8,%r14
+ jnc .Lcbc_dec_loop
+
+ addq $8,%r14
+ jz .Lcbc_dec_done
+
+ movdqu 0(%r12),%xmm15
+ movq %rsp,%rax
+ movl %edx,%r10d
+ cmpq $2,%r14
+ jb .Lcbc_dec_one
+ movdqu 16(%r12),%xmm0
+ je .Lcbc_dec_two
+ movdqu 32(%r12),%xmm1
+ cmpq $4,%r14
+ jb .Lcbc_dec_three
+ movdqu 48(%r12),%xmm2
+ je .Lcbc_dec_four
+ movdqu 64(%r12),%xmm3
+ cmpq $6,%r14
+ jb .Lcbc_dec_five
+ movdqu 80(%r12),%xmm4
+ je .Lcbc_dec_six
+ movdqu 96(%r12),%xmm5
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm14
+ pxor %xmm12,%xmm2
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm14
+ pxor %xmm11,%xmm6
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm14
+ pxor %xmm10,%xmm1
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm14
+ pxor %xmm9,%xmm3
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm14
+ pxor %xmm8,%xmm5
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm14
+ pxor %xmm7,%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ leaq (%r12),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm14
+ movdqu %xmm14,(%r13)
+ movdqa %xmm15,%xmm14
+
+.Lcbc_dec_done:
+ movdqu %xmm14,(%rbx)
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lcbc_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcbc_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lcbc_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,@function
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ movq %rsp,%rax
+.Lctr_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movdqu (%r8),%xmm0
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqa %xmm0,32(%rbp)
+ cmpq $8,%rdx
+ jb .Lctr_enc_short
+
+ movl %eax,%ebx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %ebx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ movdqa (%rsp),%xmm8
+ leaq .LADD1(%rip),%r11
+ movdqa 32(%rbp),%xmm15
+ movdqa -32(%r11),%xmm7
+.byte 102,68,15,56,0,199
+.byte 102,68,15,56,0,255
+ movdqa %xmm8,(%rsp)
+ jmp .Lctr_enc_loop
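+# In the loop below the counter block is kept byte-swapped (the .byte
+# 102,68,15,56,0,... sequences are pshufb encodings), so plain paddd
+# with .LADD1..LADD7 derives the seven following counter values, and
+# the final paddd with .LADD8 steps the base counter by 8 for the next
+# iteration (a hedged reading of the generated code).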
+.align 16
+.Lctr_enc_loop:
+ movdqa %xmm15,32(%rbp)
+ movdqa %xmm15,%xmm0
+ movdqa %xmm15,%xmm1
+ paddd 0(%r11),%xmm0
+ movdqa %xmm15,%xmm2
+ paddd 16(%r11),%xmm1
+ movdqa %xmm15,%xmm3
+ paddd 32(%r11),%xmm2
+ movdqa %xmm15,%xmm4
+ paddd 48(%r11),%xmm3
+ movdqa %xmm15,%xmm5
+ paddd 64(%r11),%xmm4
+ movdqa %xmm15,%xmm6
+ paddd 80(%r11),%xmm5
+ paddd 96(%r11),%xmm6
+
+
+
+ movdqa (%rsp),%xmm8
+ leaq 16(%rsp),%rax
+ movdqa -16(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+ leaq .LBS0(%rip),%r11
+.byte 102,15,56,0,247
+ movl %ebx,%r10d
+
+ call _bsaes_encrypt8_bitslice
+
+ subq $8,%r14
+ jc .Lctr_enc_loop_done
+
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ movdqu 32(%r12),%xmm9
+ movdqu 48(%r12),%xmm10
+ movdqu 64(%r12),%xmm11
+ movdqu 80(%r12),%xmm12
+ movdqu 96(%r12),%xmm13
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ pxor %xmm15,%xmm7
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm8,%xmm0
+ movdqu %xmm7,0(%r13)
+ pxor %xmm9,%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor %xmm10,%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor %xmm11,%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor %xmm12,%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor %xmm13,%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor %xmm14,%xmm4
+ movdqu %xmm1,96(%r13)
+ leaq .LADD1(%rip),%r11
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ paddd 112(%r11),%xmm15
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ addq $8,%r14
+ movdqu 0(%r12),%xmm7
+ pxor %xmm7,%xmm15
+ movdqu %xmm15,0(%r13)
+ cmpq $2,%r14
+ jb .Lctr_enc_done
+ movdqu 16(%r12),%xmm8
+ pxor %xmm8,%xmm0
+ movdqu %xmm0,16(%r13)
+ je .Lctr_enc_done
+ movdqu 32(%r12),%xmm9
+ pxor %xmm9,%xmm3
+ movdqu %xmm3,32(%r13)
+ cmpq $4,%r14
+ jb .Lctr_enc_done
+ movdqu 48(%r12),%xmm10
+ pxor %xmm10,%xmm5
+ movdqu %xmm5,48(%r13)
+ je .Lctr_enc_done
+ movdqu 64(%r12),%xmm11
+ pxor %xmm11,%xmm2
+ movdqu %xmm2,64(%r13)
+ cmpq $6,%r14
+ jb .Lctr_enc_done
+ movdqu 80(%r12),%xmm12
+ pxor %xmm12,%xmm6
+ movdqu %xmm6,80(%r13)
+ je .Lctr_enc_done
+ movdqu 96(%r12),%xmm13
+ pxor %xmm13,%xmm1
+ movdqu %xmm1,96(%r13)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ leaq 32(%rbp),%rdi
+ leaq 48(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ movdqu (%r12),%xmm0
+ leaq 16(%r12),%r12
+ movl 44(%rbp),%eax
+ bswapl %eax
+ pxor 48(%rbp),%xmm0
+ incl %eax
+ movdqu %xmm0,(%r13)
+ bswapl %eax
+ leaq 16(%r13),%r13
+ movl %eax,44(%rsp)
+ decq %r14
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lctr_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lctr_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,@function
+.align 16
+bsaes_xts_encrypt:
+ movq %rsp,%rax
+.Lxts_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ andq $-16,%r14
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
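+# The pshufd/pcmpgtd/paddq/pand/pxor pattern below multiplies the
+# 128-bit XTS tweak by x in GF(2^128): paddq doubles both 64-bit
+# halves, pcmpgtd against zero picks up the two carried-out sign bits,
+# pshufd $19 routes them to the destination lanes, and pand with
+# .Lxts_magic (0x87,0,1,0) turns them into the bit-64 carry-in and the
+# low-lane reduction by x^7+x^2+x+1 (0x87); a hedged reading of the
+# generated code.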
+.align 16
+.Lxts_enc_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm1,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ addq $128,%r14
+ jz .Lxts_enc_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_enc_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_enc_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_enc_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_enc_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_enc_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_enc_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm1,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ movdqu %xmm2,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ movdqu %xmm5,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm3,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_enc_done:
+ andl $15,%ebx
+ jz .Lxts_enc_ret
+ movq %r13,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%r12),%eax
+ movzbl -16(%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,-16(%rdx)
+ movb %cl,0(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16(%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm6
+ movdqu %xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,@function
+.align 16
+bsaes_xts_decrypt:
+ movq %rsp,%rax
+.Lxts_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ xorl %eax,%eax
+ andq $-16,%r14
+ testl $15,%ebx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%r14
+
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ addq $128,%r14
+ jz .Lxts_dec_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_dec_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_dec_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_dec_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_dec_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_dec_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_dec_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_dec_done:
+ andl $15,%ebx
+ jz .Lxts_dec_ret
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ movdqa %xmm6,%xmm5
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ movdqu (%r12),%xmm15
+ pxor %xmm13,%xmm6
+
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm6
+ movq %r13,%rdx
+ movdqu %xmm6,(%r13)
+
+.Lxts_dec_steal:
+ movzbl 16(%r12),%eax
+ movzbl (%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,(%rdx)
+ movb %cl,16(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu (%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm5,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm5
+ movdqu %xmm5,(%r13)
+
+.Lxts_dec_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+.type _bsaes_const,@object
+.align 64
+_bsaes_const:
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lmasks:
+.quad 0x0101010101010101, 0x0101010101010101
+.quad 0x0202020202020202, 0x0202020202020202
+.quad 0x0404040404040404, 0x0404040404040404
+.quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad 0x6363636363636363, 0x6363636363636363
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align 64
+.size _bsaes_const,.-_bsaes_const
diff --git a/main/openssl/crypto/aes/asm/bsaes-x86_64.pl b/main/openssl/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 00000000..41b90f08
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3108 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# Started as a transliteration to "perlasm", the original code has
+# undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+# from 12.5KB to 2.2KB;
+# - the above was possible thanks to a mixcolumns() modification that
+#   allows feeding its output back to aesenc[last]; this was
+#   achieved at the cost of two additional inter-register moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement a key setup subroutine; instead it
+# relies on conversion of "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+#   skipping one shiftrows(), reducing the bit-sliced key schedule
+#   and speeding up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.98 +9%
+# Atom 17.1 17.4 -2%(***)
+#
+# (*) Comparison is not completely fair, because "this" is ECB,
+# i.e. no extra processing such as counter values calculation
+# and xor-ing input as in Emilia's CTR implementation is
+#	performed. However, the CTR calculations account for no more
+# than 1% of total time, so comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# (***)	Slowdown on Atom is rather strange per se, because the original
+#	implementation has a number of 9+-byte instructions, which
+#	are bad for the Atom front-end and which I eliminated completely.
+#	In an attempt to address the deterioration, sbox() was tested
+#	in the FP SIMD "domain" (movaps instead of movdqa, xorps
+#	instead of pxor, etc.). While it resulted in a nominal 4%
+#	improvement on Atom, it hurt Westmere by more than a 2x factor.
+#
+# As for the key schedule conversion subroutine: the interface to
+# OpenSSL relies on per-invocation on-the-fly conversion. This
+# naturally has an impact on performance, especially for short
+# inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# conversion conversion/8x block
+# Core 2 240 0.22
+# Nehalem 180 0.20
+# Atom 430 0.19
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially the shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
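+#
+# (A worked illustration, assuming the costs simply add up: on Core 2 a
+# 128-byte block takes 128*8.69 = 1112 cycles plus 240 cycles of
+# conversion, so throughput drops by 0.22/1.22 = 18%; at 256 bytes the
+# conversion is amortized over two 8x blocks, 0.11/1.11 = 10%, matching
+# the 16-18% and 9-10% figures above.)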
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2 9.83
+# Nehalem 7.74
+# Atom 19.0
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
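+
+# The S-box is thus evaluated as basis change -> inversion in GF(2^8) ->
+# inverse basis change, with the inversion carried out in a tower-field
+# representation by Inv_GF256 below; the affine constant 0x63 is not
+# added here because _bsaes_key_convert in effect folds it into the
+# round keys (a descriptive summary of the structure, hedged).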
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[5]
+ pxor @b[1], @b[2]
+ pxor @b[0], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[0], @b[5]
+
+ pxor @b[3], @b[6]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[1], @b[3]
+
+ pxor @b[7], @b[2]
+ pxor @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[1], @b[6]
+
+ pxor @b[5], @b[1]
+ pxor @b[3], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+
+ pxor @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ pxor @b[7], @b[4]
+
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+ pxor @b[7], @b[3]
+ pxor @b[3], @b[5]
+ pxor @b[5], @b[1]
+
+ pxor @b[1], @b[6]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ pxor @b[5], @b[1]
+ pxor @b[7], @b[2]
+
+ pxor @b[1], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[0], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[1], @b[2]
+ pxor @b[3], @b[6]
+
+ pxor @b[0], @b[3]
+ pxor @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x1, $x0
+ pxor $t0, $x1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x0, $x1
+ pxor $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ movdqa $y2, $t1
+ pxor $y1, $t0
+ pxor $y3, $t1
+ pand $x0, $t0
+ pand $x2, $t1
+ pxor $x1, $x0
+ pxor $x3, $x2
+ pand $y0, $x1
+ pand $y2, $x3
+ pand $y1, $x0
+ pand $y3, $x2
+ pxor $x0, $x1
+ pxor $x3, $x2
+ pxor $t0, $x0
+ pxor $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ movdqa @x[0], @t[0]
+ movdqa @x[1], @t[1]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+ pxor @x[2], @t[0]
+ pxor @x[3], @t[1]
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @t[0], @x[0]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[1]
+ pxor @t[1], @x[3]
+
+ movdqa @x[4], @t[0]
+ movdqa @x[5], @t[1]
+ pxor @x[6], @t[0]
+ pxor @x[7], @t[1]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+ pxor @t[0], @x[4]
+ pxor @t[0], @x[6]
+ pxor @t[1], @x[5]
+ pxor @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+ pxor 0x00($key),@x[0]
+ pxor 0x10($key),@x[1]
+ pshufb $mask,@x[0]
+ pxor 0x20($key),@x[2]
+ pshufb $mask,@x[1]
+ pxor 0x30($key),@x[3]
+ pshufb $mask,@x[2]
+ pxor 0x40($key),@x[4]
+ pshufb $mask,@x[3]
+ pxor 0x50($key),@x[5]
+ pshufb $mask,@x[4]
+ pxor 0x60($key),@x[6]
+ pshufb $mask,@x[5]
+ pxor 0x70($key),@x[7]
+ pshufb $mask,@x[6]
+ lea 0x80($key),$key
+ pshufb $mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+___
+$code.=<<___ if (!$inv);
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+$code.=<<___ if ($inv);
+ pxor @x[4], @t[3]
+ pxor @t[7], @x[5]
+ pxor @x[3], @t[6]
+ movdqa @t[0], @x[3]
+ pxor @t[2], @x[6]
+ movdqa @t[6], @x[2]
+ movdqa @t[1], @x[7]
+ movdqa @x[6], @x[4]
+ movdqa @t[3], @x[6]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
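+# Spot-check over GF(2^8), for illustration: entry (1,1) is
+# 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0x0a ^ 0x04 = 0x0e, and entry (1,2)
+# is 03*05 ^ 01*04 = 0x0f ^ 0x04 = 0x0b, as required.
+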
+$code.=<<___;
+ # multiplication by 0x05-0x00-0x04-0x00
+ pshufd \$0x4E, @x[0], @t[0]
+ pshufd \$0x4E, @x[6], @t[6]
+ pxor @x[0], @t[0]
+ pshufd \$0x4E, @x[7], @t[7]
+ pxor @x[6], @t[6]
+ pshufd \$0x4E, @x[1], @t[1]
+ pxor @x[7], @t[7]
+ pshufd \$0x4E, @x[2], @t[2]
+ pxor @x[1], @t[1]
+ pshufd \$0x4E, @x[3], @t[3]
+ pxor @x[2], @t[2]
+ pxor @t[6], @x[0]
+ pxor @t[6], @x[1]
+ pshufd \$0x4E, @x[4], @t[4]
+ pxor @x[3], @t[3]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[3]
+ pshufd \$0x4E, @x[5], @t[5]
+ pxor @x[4], @t[4]
+ pxor @t[7], @x[1]
+ pxor @t[2], @x[4]
+ pxor @x[5], @t[5]
+
+ pxor @t[7], @x[2]
+ pxor @t[6], @x[3]
+ pxor @t[6], @x[4]
+ pxor @t[3], @x[5]
+ pxor @t[4], @x[6]
+ pxor @t[7], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[5], @x[7]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub aesenc { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@t[0] # .LSR
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@t[0] # .LSRM0
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+$code.=<<___
+ pxor 0x00($key),@b[0]
+ pxor 0x10($key),@b[1]
+ pxor 0x20($key),@b[4]
+ pxor 0x30($key),@b[6]
+ pxor 0x40($key),@b[3]
+ pxor 0x50($key),@b[7]
+ pxor 0x60($key),@b[2]
+ pxor 0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ movdqa $b,$t
+ psrlq \$$n,$b
+ pxor $a,$b
+ pand $mask,$b
+ pxor $b,$a
+ psllq \$$n,$b
+ pxor $t,$b
+___
+}
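+# swapmove is the classic delta swap: with m = (($b>>$n) ^ $a) & $mask
+# it computes $a ^= m and $b ^= m<<$n, exchanging the masked bits of $a
+# with the bits of $b that sit $n positions higher. Three passes with
+# n = 1, 2, 4 and masks .LBS0/.LBS1/.LBS2 (see bitslice below) perform
+# the 8x8 bit-matrix transposition that, up to the ordering conventions
+# used here, puts bit k of every byte into register k.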
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ movdqa $b0,$t0
+ psrlq \$$n,$b0
+ movdqa $b1,$t1
+ psrlq \$$n,$b1
+ pxor $a0,$b0
+ pxor $a1,$b1
+ pand $mask,$b0
+ pand $mask,$b1
+ pxor $b0,$a0
+ psllq \$$n,$b0
+ pxor $b1,$a1
+ psllq \$$n,$b1
+ pxor $t0,$b0
+ pxor $t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$t0 # .LBS0
+ movdqa 0x10($const),$t1 # .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ movdqa 0x20($const),$t0 # .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x50($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .Lmasks(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ lea 0x10($inp), $inp
+ movdqa 0x00($const), %xmm0 # 0x01...
+ movdqa 0x10($const), %xmm1 # 0x02...
+ movdqa 0x20($const), %xmm2 # 0x04...
+ movdqa 0x30($const), %xmm3 # 0x08...
+ movdqa 0x40($const), %xmm4 # .LM0
+ pcmpeqd %xmm5, %xmm5 # .LNOT
+
+ movdqu ($inp), %xmm6 # load round 1 key
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm4, %xmm6 # .LM0
+
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm9
+
+ pand %xmm6, %xmm8
+ pand %xmm6, %xmm9
+ movdqa %xmm2, %xmm10
+ pcmpeqb %xmm0, %xmm8
+ psllq \$4, %xmm0 # 0x10...
+ movdqa %xmm3, %xmm11
+ pcmpeqb %xmm1, %xmm9
+ psllq \$4, %xmm1 # 0x20...
+
+ pand %xmm6, %xmm10
+ pand %xmm6, %xmm11
+ movdqa %xmm0, %xmm12
+ pcmpeqb %xmm2, %xmm10
+ psllq \$4, %xmm2 # 0x40...
+ movdqa %xmm1, %xmm13
+ pcmpeqb %xmm3, %xmm11
+ psllq \$4, %xmm3 # 0x80...
+
+ movdqa %xmm2, %xmm14
+ movdqa %xmm3, %xmm15
+ pxor %xmm5, %xmm8 # "pnot"
+ pxor %xmm5, %xmm9
+
+ pand %xmm6, %xmm12
+ pand %xmm6, %xmm13
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
+ pcmpeqb %xmm0, %xmm12
+ psrlq \$4, %xmm0 # 0x01...
+ movdqa %xmm9, 0x10($out)
+ pcmpeqb %xmm1, %xmm13
+ psrlq \$4, %xmm1 # 0x02...
+ lea 0x10($inp), $inp
+
+ pand %xmm6, %xmm14
+ pand %xmm6, %xmm15
+ movdqa %xmm10, 0x20($out)
+ pcmpeqb %xmm2, %xmm14
+ psrlq \$4, %xmm2 # 0x04...
+ movdqa %xmm11, 0x30($out)
+ pcmpeqb %xmm3, %xmm15
+ psrlq \$4, %xmm3 # 0x08...
+ movdqu ($inp), %xmm6 # load next round key
+
+ pxor %xmm5, %xmm13 # "pnot"
+ pxor %xmm5, %xmm14
+ movdqa %xmm12, 0x40($out)
+ movdqa %xmm13, 0x50($out)
+ movdqa %xmm14, 0x60($out)
+ movdqa %xmm15, 0x70($out)
+ lea 0x80($out),$out
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x50($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
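+	# Keep the counter with its top 32-bit word byte-swapped so plain paddd
+	# can increment it; shuffle the round-0 key the same way so the initial
+	# whitening xor in the loop below still lines up.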
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ # to flip byte order in 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ lea .LBS0(%rip), %r11 # constants table
+ pshufb @XMM[8], @XMM[7]
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
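+	# fewer than 8 blocks: one block at a time via the table-based routine;
+	# 0x20(%rbp) holds the counter block, 0x30(%rbp) the keystream, and the
+	# big-endian 32-bit counter sits at offset 0x2c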
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp equals %rbp on this path)
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
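+# The initial tweak is the IV encrypted under key2; key1 then processes the
+# data, and after every block the tweak is multiplied by x in GF(2^128),
+# roughly: tweak = (tweak << 1) ^ (0x87 & -(tweak >> 127)).
+#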
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$arg6=~s/d$//;
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
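+# The unrolled loop below saves tweak[0..6] on the stack, xors each input
+# block with its tweak, and advances the running tweak: the pcmpgtd/pshufd
+# pair broadcasts the carry bits, paddq doubles both halves, and pand/pxor
+# fold in the .Lxts_magic reduction.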
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
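+	# ciphertext stealing: swap the len%16 tail bytes of the last full
+	# ciphertext block with the remaining plaintext bytes, then re-encrypt
+	# the spliced block below under the next tweak (@XMM[7])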
+.Lxts_enc_steal:
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
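+	# decrypt-side ciphertext stealing: the block above was decrypted with
+	# the next tweak (@XMM[7]); after swapping the tail bytes, the spliced
+	# block is decrypted with the preceding tweak saved in @XMM[6]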
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object
+.align 64
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+ .long 0x87,0,1,0
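+	# 0x87 encodes x^7+x^2+x+1, xored in when bit 127 carries out of the
+	# doubled tweak; the 1 in the third dword carries bit 63 into bit 64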
+.Lmasks:
+ .quad 0x0101010101010101, 0x0101010101010101
+ .quad 0x0202020202020202, 0x0202020202020202
+ .quad 0x0404040404040404, 0x0404040404040404
+ .quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
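+	# If context->Rip lies between the prologue and epilogue labels recorded
+	# in HandlerData[], recover the saved %xmm and integer registers from the
+	# frame so RtlVirtualUnwind sees a consistent context.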
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86.S b/main/openssl/crypto/aes/asm/vpaes-x86.S
new file mode 100644
index 00000000..c53a5074
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86.S
@@ -0,0 +1,661 @@
+.file "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%edx),%xmm5
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal 192(%ebp),%ebx
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebp),%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%ebx,%ecx,1),%xmm1
+ movdqa 80(%ebp),%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm3,%xmm0
+ subl $1,%eax
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movl 240(%edx),%eax
+ leal 608(%ebp),%ebx
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,56,0,197
+ movdqa (%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subl $1,%eax
+.byte 102,15,56,0,197
+ movdqa 32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqa 64(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,58,15,237,12
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%edx),%xmm0
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L016pic_point,%ebp
+ call _vpaes_schedule_core
+.L016pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_preheat
+.L018pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L019pic_point,%ebp
+ call _vpaes_preheat
+.L019pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ cmpl $0,%ecx
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L020cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86.pl b/main/openssl/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 00000000..1533e2c3
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. The aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86.pl column lists
+# [also large-block] CBC encrypt/decrypt results.
+#
+# aes-586.pl vpaes-x86.pl
+#
+# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
+# Nehalem 27.9/40.4/18.1 10.3/12.0
+# Atom 102./119./60.1 64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared
+#	among multiple cores rather than specifically to Intel HTT.
+#	As the vast majority of contemporary cores share cache, the
+#	slower code path is commonplace. In other words, the
+#	"with-hyper-threading-off" results are presented mostly for
+#	reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due
+#	to slow pshufb, yet it's a respectable +32%/65% improvement
+#	on Core 2
+# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+# code path).
+#
+# <appro@openssl.org>
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my ($round, $base, $magic, $key, $const, $inp, $out)=
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30; # inv, inva
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10; # s0F
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00; # input transform (lo, hi)
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20; # sb1u, sb1t
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40; # sb2u, sb2t
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60; # sbou, sbot
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80; # mc_forward
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0; # mc_backward
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100; # sr
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140; # rcon
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150; # s63: all equal to 0x63 transformed
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160; # output transform
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+## Decryption stuff
+## Key schedule constants
+##
+$k_dksd=0x1a0; # decryption key schedule: invskew x*D
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0; # decryption key schedule: invskew x*B
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200; # decryption key schedule: invskew x*9
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+## Decryption stuff
+## Round function constants
+##
+$k_dipt=0x220; # decryption input transform
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0; # decryption sbox final output
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align (64);
+
+&function_begin_B("_vpaes_preheat");
+ &add ($const,&DWP(0,"esp"));
+ &movdqa ("xmm7",&QWP($k_inv,$const));
+ &movdqa ("xmm6",&QWP($k_s0F,$const));
+ &ret ();
+&function_end_B("_vpaes_preheat");
+
+##
+## _vpaes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm6-%xmm7 as in _vpaes_preheat
+## (%edx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
+&function_begin_B("_vpaes_encrypt_core");
+ &mov ($magic,16);
+ &mov ($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_ipt,$const));
+ &pandn ("xmm1","xmm0");
+ &movdqu ("xmm5",&QWP(0,$key));
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm2","xmm5");
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($base,&DWP($k_mc_backward,$const));
+ &jmp (&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+ # middle of middle round
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
+ &pxor ("xmm2","xmm5"); # 2 = 2A
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &movdqa ("xmm3","xmm0"); # 3 = A
+ &pshufb ("xmm0","xmm1"); # 0 = B
+ &add ($key,16); # next key
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
+ &pshufb ("xmm3","xmm4"); # 3 = D
+ &add ($magic,16); # next mc
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
+ &and ($magic,0x30); # ... mod 4
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
+ &sub ($round,1); # nr--
+
+&set_label("enc_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm5","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("enc_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm1");
+ &ret ();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+ &mov ($round,&DWP(240,$key));
+ &lea ($base,&DWP($k_dsbd,$const));
+ &movdqa ("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+ &pandn ("xmm1","xmm0");
+ &mov ($magic,$round);
+	&psrld	("xmm1",4);
+ &movdqu ("xmm5",&QWP(0,$key));
+ &shl ($magic,4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+ &xor ($magic,0x30);
+ &pshufb ("xmm0","xmm1");
+ &and ($magic,0x30);
+ &pxor ("xmm2","xmm5");
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+ &jmp (&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+## Inverse mix columns
+##
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pxor ("xmm4","xmm0");
+ &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
+ &pshufb ("xmm0","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &add ($key,16); # next round key
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
+ &pshufb ("xmm0","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &sub ($round,1); # nr--
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
+ &pshufb ("xmm0","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
+ &pshufb ("xmm0","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqu ("xmm0",&QWP(0,$key));
+ &jnz (&label("dec_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm0"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
+ &movdqa ("xmm2",&QWP(0,$magic));
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+ &add ($const,&DWP(0,"esp"));
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
+
+ # input transform
+ &movdqa ("xmm3","xmm0");
+ &lea ($base,&DWP($k_ipt,$const));
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
+ &call ("_vpaes_schedule_transform");
+ &movdqa ("xmm7","xmm0");
+
+ &test ($out,$out);
+ &jnz (&label("schedule_am_decrypting"));
+
+ # encrypting, output zeroth round key after transform
+ &movdqu (&QWP(0,$key),"xmm0");
+ &jmp (&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+ # decrypting, output zeroth round key after shiftrows
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &movdqu (&QWP(0,$key),"xmm3");
+ &xor ($magic,0x30);
+
+&set_label("schedule_go");
+ &cmp ($round,192);
+ &ja (&label("schedule_256"));
+ &je (&label("schedule_192"));
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
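+## A quick count of what this path emits: the zeroth round key is
+## written before .schedule_go, _vpaes_schedule_mangle stores keys
+## 1..9, and .schedule_mangle_last stores key 10 -- 11 subkeys in
+## all, as 10-round AES-128 requires.
+##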
+&set_label("schedule_128");
+ &mov ($round,10);
+
+&set_label("loop_schedule_128");
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # write output
+ &jmp (&label("loop_schedule_128"));
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
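+## A worked count of the keys emitted here: 1 (zeroth, before the
+## loop) + 3 per full cycle for the first three cycles + 2 in the
+## final cycle + 1 from .schedule_mangle_last = 13 subkeys, exactly
+## what 12-round AES-192 needs.
+##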
+&set_label("schedule_192",16);
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &movdqa ("xmm6","xmm0"); # save short part
+ &pxor ("xmm4","xmm4"); # clear 4
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
+ &mov ($round,4);
+
+&set_label("loop_schedule_192");
+ &call ("_vpaes_schedule_round");
+ &palignr("xmm0","xmm6",8);
+ &call ("_vpaes_schedule_mangle"); # save key n
+ &call ("_vpaes_schedule_192_smear");
+ &call ("_vpaes_schedule_mangle"); # save key n+1
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # save key n+2
+ &call ("_vpaes_schedule_192_smear");
+ &jmp (&label("loop_schedule_192"));
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
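+## Counting as above: 1 zeroth key + 2 keys per cycle for six
+## cycles + 1 from the seventh cycle's mangle + 1 from
+## .schedule_mangle_last = 15 subkeys for 14-round AES-256.
+##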
+&set_label("schedule_256",16);
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &mov ($round,7);
+
+&set_label("loop_schedule_256");
+ &call ("_vpaes_schedule_mangle"); # output low result
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
+
+ # high round
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle");
+
+ # low round. swap xmm7 and xmm6
+ &pshufd ("xmm0","xmm0",0xFF);
+ &movdqa (&QWP(20,"esp"),"xmm7");
+ &movdqa ("xmm7","xmm6");
+ &call ("_vpaes_schedule_low_round");
+ &movdqa ("xmm7",&QWP(20,"esp"));
+
+ &jmp (&label("loop_schedule_256"));
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+ # schedule last round key from xmm0
+ &lea ($base,&DWP($k_deskew,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_last_dec"));
+
+ # encrypting
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm0","xmm1"); # output permute
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
+ &add ($key,32);
+
+&set_label("schedule_mangle_last_dec");
+ &add ($key,-16);
+ &pxor ("xmm0",&QWP($k_s63,$const));
+ &call ("_vpaes_schedule_transform"); # output transform
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
+
+ # cleanup
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+ &ret ();
+&function_end_B("_vpaes_schedule_core");
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+##	a zero register (%xmm13 in the 64-bit version; this
+##	32-bit port zeroes %xmm1 locally instead)
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+ &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
+ &pxor ("xmm6","xmm0"); # -> c+d c 0 0
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
+ &movdqa ("xmm0","xmm6");
+ &pxor ("xmm1","xmm1");
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
+ &ret ();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from the low byte of the value kept at 8(%esp)
+## (this port's stand-in for %xmm8), then rotates it for the
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm5.
+##
+&function_begin_B("_vpaes_schedule_round");
+ # extract rcon from xmm8
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
+ &pxor ("xmm1","xmm1");
+ &palignr("xmm1","xmm2",15);
+ &palignr("xmm2","xmm2",15);
+ &pxor ("xmm7","xmm1");
+
+ # rotate
+ &pshufd ("xmm0","xmm0",0xFF);
+ &palignr("xmm0","xmm0",1);
+
+ # fall through...
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
+
+ # low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+ # smear xmm7
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",4);
+ &pxor ("xmm7","xmm1");
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",8);
+ &pxor ("xmm7","xmm1");
+ &pxor ("xmm7",&QWP($k_s63,$const));
+
+ # subbyte
+ &movdqa ("xmm4",&QWP($k_s0F,$const));
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
+ &movdqa ("xmm1","xmm4");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm4"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
+
+ # add in smeared stuff
+ &pxor ("xmm0","xmm7");
+ &movdqa ("xmm7","xmm0");
+ &ret ();
+&function_end_B("_vpaes_schedule_round");
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%ebx)
+##
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
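+## In effect, for each byte b of %xmm0, with lo/hi denoting the
+## 16-byte lookup tables at (%ebx) and 16(%ebx):
+##
+##	out = lo[b & 0x0F] ^ hi[b >> 4]
+##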
+&function_begin_B("_vpaes_schedule_transform");
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm2");
+ &movdqa ("xmm2",&QWP(0,$base));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP(16,$base));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%edx), and increments or decrements it
+## Keeps track of round number mod 4 in %ecx
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
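+## Writing r() for the one-byte-per-column rotation done by the
+## k_mc_forward pshufb, the encrypt-side result below works out to
+##
+##	r(y) ^ r^2(y) ^ r^3(y),   where y = xmm0 ^ k_s63
+##
+## i.e. the circulant (0,1,1,1) applied to y.
+##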
+&function_begin_B("_vpaes_schedule_mangle");
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_dec"));
+
+ # encrypting
+ &add ($key,16);
+ &pxor ("xmm4",&QWP($k_s63,$const));
+ &pshufb ("xmm4","xmm5");
+ &movdqa ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+
+ &jmp (&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+ # inverse mix columns
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &lea ($inp,&DWP($k_dksd,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm4");
+ &psrld ("xmm1",4); # 1 = hi
+ &pand ("xmm4","xmm2"); # 4 = lo
+
+ &movdqa ("xmm2",&QWP(0,$inp));
+ &pshufb ("xmm2","xmm4");
+ &movdqa ("xmm3",&QWP(0x10,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x20,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x30,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x40,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x50,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x60,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x70,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+
+ &add ($key,-16);
+
+&set_label("schedule_mangle_both");
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &add ($magic,-16);
+ &and ($magic,0x30);
+ &movdqu (&QWP(0,$key),"xmm3");
+ &ret ();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
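+	# (so 128/192/256-bit keys store 9/11/13 here -- the count of
+	# middle rounds only, since the zeroth and last round keys are
+	# applied outside the cores' main loops; the schedule emits
+	# rounds+2 subkeys in total)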
+ &mov ($magic,0x30);
+ &mov ($out,0);
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
+
+&function_begin("${PREFIX}_set_decrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &shl ($base,4);
+ &lea ($key,&DWP(16,$key,$base));
+
+ &mov ($out,1);
+ &mov ($magic,$round);
+ &shr ($magic,1);
+ &and ($magic,32);
+	&xor	($magic,32);		# nbits==192?0:32;
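+	# worked values: 128>>1&32^32 = 32, 192>>1&32^32 = 0,
+	# 256>>1&32^32 = 32 -- the initial byte offset into k_sr
+	# used while scheduling for decryption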
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_encrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_decrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0)); # inp
+ &mov ($out,&wparam(1)); # out
+ &mov ($round,&wparam(2)); # len
+ &mov ($key,&wparam(3)); # key
+ &sub ($round,16);
+ &jc (&label("cbc_abort"));
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($const,&wparam(4)); # ivp
+ &and ($base,-16);
+ &mov ($magic,&wparam(5)); # enc
+ &xchg ($base,"esp"); # alloca
+ &movdqu ("xmm1",&QWP(0,$const)); # load IV
+ &sub ($out,$inp);
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov (&DWP(0,"esp"),$out); # save out
+	&mov	(&DWP(4,"esp"),$key);		# save key
+ &mov (&DWP(8,"esp"),$const); # save ivp
+ &mov ($out,$round); # $out works as $len
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &cmp ($magic,0);
+ &je (&label("cbc_dec_loop"));
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &pxor ("xmm0","xmm1"); # inp^=iv
+ &call ("_vpaes_encrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &movdqa ("xmm1","xmm0");
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_enc_loop"));
+ &jmp (&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
+ &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
+ &call ("_vpaes_decrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
+ &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+ &mov ($base,&DWP(8,"esp")); # restore ivp
+ &mov ("esp",&DWP(48,"esp"));
+ &movdqu (&QWP(0,$base),"xmm1"); # write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86_64.S b/main/openssl/crypto/aes/asm/vpaes-x86_64.S
new file mode 100644
index 00000000..2b68e615
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86_64.S
@@ -0,0 +1,828 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa %xmm15,%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%r11,%r10,1),%xmm1
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm3,%xmm0
+ subq $1,%rax
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+ movdqu (%r9),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+
+.byte 102,15,56,0,197
+ movdqa 0(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subq $1,%rax
+
+.byte 102,15,56,0,197
+ movdqa 32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,56,0,197
+ movdqa 64(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,58,15,237,12
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%r9),%xmm0
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $48,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $48,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ .byte 0xf3,0xc3
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86_64.pl b/main/openssl/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 00000000..41f2e46f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1207 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. The aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86_64.pl column lists
+# [also large-block] CBC encrypt/decrypt results.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
+# Nehalem 30.5/42.2/14.6 9.8/11.8
+# Atom 63.9/79.0/32.1 64.0/84.8(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared
+#	among multiple cores rather than specifically to Intel HTT.
+#	As the vast majority of contemporary cores share cache, the
+#	slower code path is commonplace. In other words, the
+#	"with-hyper-threading-off" results are presented mostly for
+#	reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due
+#	to slow pshufb, yet it's a respectable +40%/78% improvement
+#	on Core 2
+# (as implied, over "hyper-threading-safe" code path).
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _vpaes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
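+## (vpaes_cbc_encrypt below relies on this: it keeps the IV in
+## %xmm6 and, when decrypting, the saved input block in %xmm7
+## across calls into the cores.)
+##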
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm5 # 4 : sb2u
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm5, %xmm2 # 2 = 2A
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ sub \$1,%rax # nr--
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm5 # 2 : a/k
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqu (%r9), %xmm5
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor %xmm0, %xmm4
+ movdqa -0x10(%r10),%xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ pxor %xmm4, %xmm0 # 0 = ch
+ add \$16, %r9 # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x10(%r10),%xmm0 # 0 : sbdt
+ pshufb %xmm3, %xmm0 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x30(%r10),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ palignr \$12, %xmm5, %xmm5
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqu (%r9), %xmm0
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
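A quick round-key accounting for the loop above, as a hypothetical
self-check: the zeroth key is written before .Lschedule_go, each trip
through .Loop_schedule_192 emits three keys, and the last key of the final
trip comes from .Lschedule_mangle_last.

    #include <assert.h>

    /* hypothetical self-check, not part of the generator */
    static void check_192_key_count(void)
    {
        int nr = 192/32 + 6;        /* 12 rounds              */
        int keys_needed = nr + 1;   /* 13 round keys          */
        int loops = 4;              /* "mov \$4, %esi" above  */
        assert(1 + 3*loops == keys_needed);
    }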
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
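The same accounting holds for this path: the zeroth key precedes the loop,
each trip saves two keys (one low, one high), and the fifteenth is written
by .Lschedule_mangle_last. A hypothetical self-check:

    #include <assert.h>

    static void check_256_key_count(void)
    {
        int nr = 256/32 + 6;        /* 14 rounds              */
        int keys_needed = nr + 1;   /* 15 round keys          */
        int loops = 7;              /* "mov \$7, %esi" above  */
        assert(1 + 2*loops == keys_needed);
    }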
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0
+ pxor %xmm1, %xmm1
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
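The same smear, restated with SSE2 intrinsics; a sketch under the comment's
lane notation (high dword written leftmost), not part of the generator:

    #include <emmintrin.h>

    /* *x6 in/out = low side (%xmm6), x7 = high side (%xmm7);
       returns the %xmm0 result */
    static __m128i schedule_192_smear(__m128i *x6, __m128i x7)
    {
        __m128i t;
        t   = _mm_shuffle_epi32(*x6, 0x80); /* d c 0 0 -> c 0 0 0 */
        *x6 = _mm_xor_si128(*x6, t);        /* -> c+d c 0 0       */
        t   = _mm_shuffle_epi32(x7, 0xFE);  /* b a _ _ -> b b b a */
        *x6 = _mm_xor_si128(*x6, t);        /* -> b+c+d b+c b a   */
        t   = *x6;                          /* full result        */
        *x6 = _mm_unpackhi_epi64(_mm_setzero_si128(), *x6);
                                            /* zero the low side  */
        return t;
    }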
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
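The two pslldq/pxor pairs in the "smear xmm7" step implement a running xor
across the dwords of %xmm7; per lane it is equivalent to this scalar sketch
(illustrative):

    #include <stdint.h>

    static void smear_xmm7(uint32_t w[4])   /* w[0] = low dword */
    {
        w[1] ^= w[0];
        w[2] ^= w[1];
        w[3] ^= w[2];  /* each dword ends up xored with all below it */
    }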
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
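Per byte, the transform is two 16-entry nibble lookups xored together; a
scalar sketch, with lo and hi standing for the tables at (%r11) and
16(%r11):

    #include <stdint.h>

    static uint8_t transform_byte(const uint8_t lo[16],
                                  const uint8_t hi[16], uint8_t x)
    {
        return lo[x & 0x0F] ^ hi[x >> 4];
    }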
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8
+ and \$0x30, %r8
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
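On the encrypt side, the three pshufb/pxor pairs multiply each 4-byte
column by the circulant (0,1,1,1), i.e. they xor the three nontrivial byte
rotations of the column. A scalar sketch per column (rotation direction
follows the .Lk_mc_forward table; illustrative only):

    #include <stdint.h>

    static uint32_t rotr8(uint32_t c) { return (c >> 8) | (c << 24); }

    static uint32_t mangle_enc_column(uint32_t c) /* c already ^0x63-skewed */
    {
        uint32_t r1 = rotr8(c), r2 = rotr8(r1), r3 = rotr8(r2);
        return r1 ^ r2 ^ r3;  /* circulant (0,1,1,1): c itself absent */
    }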
+
+#
+# Interface to OpenSSL
+#
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov \$0,%ecx
+ mov \$0x30,%r8d
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
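Note the rounds value stored here, nbits/32+5, is one less than the
textbook round count: the cores peel the last round off into their own
last-round path, so the stored value counts only the looped rounds. A
hypothetical sanity check:

    #include <assert.h>

    static void check_vpaes_rounds(void)
    {
        assert(128/32 + 5 ==  9);  /* AES-128: 10 rounds, 9 looped */
        assert(192/32 + 5 == 11);  /* AES-192: 12 rounds           */
        assert(256/32 + 5 == 13);  /* AES-256: 14 rounds           */
    }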
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx
+
+ mov \$1,%ecx
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
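The shr/and/xor sequence above computes the initial shiftrows index that
seeds %r8 for _vpaes_schedule_mangle; spelled out as a hypothetical helper:

    static int dec_initial_rotation(int nbits)
    {
        return ((nbits >> 1) & 32) ^ 32;  /* 192 -> 0, 128/256 -> 32 */
    }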
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len
+ jc .Lcbc_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
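The two loops implement textbook CBC: C[i] = E(P[i] ^ C[i-1]) on encrypt
and P[i] = D(C[i]) ^ C[i-1] on decrypt, with %xmm6 carrying the chain value
and %xmm7 preserving the ciphertext across the in-place decrypt. A scalar
sketch of the decrypt step (decrypt_block stands in for
_vpaes_decrypt_core):

    #include <stdint.h>
    #include <string.h>

    static void cbc_dec_block(uint8_t blk[16], uint8_t iv[16],
                              void (*decrypt_block)(uint8_t block[16]))
    {
        uint8_t saved[16];
        memcpy(saved, blk, 16);      /* movdqa %xmm0,%xmm7        */
        decrypt_block(blk);          /* call _vpaes_decrypt_core  */
        for (int i = 0; i < 16; i++)
            blk[i] ^= iv[i];         /* pxor %xmm6,%xmm0          */
        memcpy(iv, saved, 16);       /* movdqa %xmm7,%xmm6        */
    }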
+$code.=<<___;
+##
+## _vpaes_preheat
+##
+## Fills %r10 with a pointer to the constant tables (RIP-relative, so
+## the code stays position-independent for -fPIC builds) and loads
+## %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/arm_arch.h b/main/openssl/crypto/arm_arch.h
new file mode 100644
index 00000000..5a831076
--- /dev/null
+++ b/main/openssl/crypto/arm_arch.h
@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+# define __ARM_ARCH__ __TARGET_ARCH_ARM
+# if defined(__BIG_ENDIAN)
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+# elif defined(__GNUC__)
+ /*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+	 * a bunch of the macros below. See the all_architectures[] table in
+ * gcc/config/arm/arm.c. On a side note it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
+ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_6T2__)
+# define __ARM_ARCH__ 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+# define __ARM_ARCH__ 4
+# else
+# error "unsupported ARM architecture"
+# endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+
+#define ARMV7_NEON (1<<0)
+#define ARMV7_TICK (1<<1)
+#endif
+
+#endif
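Typical use of these declarations from C (illustrative; OPENSSL_armcap_P
itself is populated at startup elsewhere in crypto/):

    #include "arm_arch.h"

    static int have_neon(void)
    {
        return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
    }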
diff --git a/main/openssl/crypto/armv4cpuid.S b/main/openssl/crypto/armv4cpuid.S
new file mode 100644
index 00000000..2d618dea
--- /dev/null
+++ b/main/openssl/crypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ .word 0xf26ee1fe @ vorr q15,q15,q15
+ .word 0xe12fff1e @ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrc p15,0,r0,c9,c13,0
+ .word 0xe12fff1e @ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ .word 0xe12fff1e @ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
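On ARMv6 and later, the ldrex/strex retry loop above is exactly what a
compiler emits for an atomic add-and-fetch; in C it amounts to this
(illustrative):

    static int atomic_add_sketch(int *addr, int amount)
    {
        return __sync_add_and_fetch(addr, amount);  /* GCC builtin */
    }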
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
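The branchy assembly above reduces to the usual head/body/tail zeroing
pattern; a scalar sketch (illustrative, ignoring the small-length
shortcut):

    #include <stddef.h>
    #include <stdint.h>

    static void cleanse_sketch(unsigned char *p, size_t len)
    {
        while (len > 0 && ((uintptr_t)p & 3)) { *p++ = 0; len--; }     /* .Lot      */
        while (len >= 4) { *(unsigned int *)p = 0; p += 4; len -= 4; } /* .Laligned */
        while (len > 0) { *p++ = 0; len--; }                           /* .Little   */
    }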
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+ tst r0,#1
+ beq .Lwipe_done
+ .word 0xf3000150 @ veor q0, q0, q0
+ .word 0xf3022152 @ veor q1, q1, q1
+ .word 0xf3044154 @ veor q2, q2, q2
+ .word 0xf3066156 @ veor q3, q3, q3
+ .word 0xf34001f0 @ veor q8, q8, q8
+ .word 0xf34221f2 @ veor q9, q9, q9
+ .word 0xf34441f4 @ veor q10, q10, q10
+ .word 0xf34661f6 @ veor q11, q11, q11
+ .word 0xf34881f8 @ veor q12, q12, q12
+ .word 0xf34aa1fa @ veor q13, q13, q13
+ .word 0xf34cc1fc @ veor q14, q14, q14
+ .word 0xf34ee1fe @ veor q15, q15, q15
+.Lwipe_done:
+ mov r0,sp
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
diff --git a/main/openssl/crypto/asn1/a_d2i_fp.c b/main/openssl/crypto/asn1/a_d2i_fp.c
index ece40bc4..52b2ebdb 100644
--- a/main/openssl/crypto/asn1/a_d2i_fp.c
+++ b/main/openssl/crypto/asn1/a_d2i_fp.c
@@ -57,6 +57,7 @@
*/
#include <stdio.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include <openssl/asn1_mac.h>
@@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
BUF_MEM *b;
unsigned char *p;
int i;
- int ret=-1;
ASN1_const_CTX c;
- int want=HEADER_SIZE;
+ size_t want=HEADER_SIZE;
int eos=0;
-#if defined(__GNUC__) && defined(__ia64)
- /* pathetic compiler bug in all known versions as of Nov. 2002 */
- long off=0;
-#else
- int off=0;
-#endif
- int len=0;
+ size_t off=0;
+ size_t len=0;
b=BUF_MEM_new();
if (b == NULL)
@@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
want-=(len-off);
- if (!BUF_MEM_grow_clean(b,len+want))
+ if (len + want < len || !BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
goto err;
@@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
goto err;
}
if (i > 0)
+ {
+ if (len+i < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
len+=i;
+ }
}
/* else data already loaded */
@@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
/* no data body so go round again */
eos++;
+ if (eos < 0)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG);
+ goto err;
+ }
want=HEADER_SIZE;
}
else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC))
@@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
else
{
/* suck in c.slen bytes of data */
- want=(int)c.slen;
+ want=c.slen;
if (want > (len-off))
{
want-=(len-off);
+ if (want > INT_MAX /* BIO_read takes an int length */ ||
+ len+want < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
if (!BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
@@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
ASN1_R_NOT_ENOUGH_DATA);
goto err;
}
+ /* This can't overflow because
+ * |len+want| didn't overflow. */
len+=i;
- want -= i;
+ want-=i;
}
}
- off+=(int)c.slen;
+ if (off + c.slen < off)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+ off+=c.slen;
if (eos <= 0)
{
break;
@@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
}
}
+ if (off > INT_MAX)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+
*pb = b;
return off;
err:
if (b != NULL) BUF_MEM_free(b);
- return(ret);
+ return -1;
}
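The guard pattern added throughout this function relies on unsigned
wraparound being well defined: for size_t, a + b < a holds exactly when the
addition overflowed. As a standalone sketch:

    #include <stddef.h>

    static int add_overflows(size_t a, size_t b)
    {
        return a + b < a;  /* wraps modulo 2^N, so true iff overflow */
    }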
diff --git a/main/openssl/crypto/asn1/a_digest.c b/main/openssl/crypto/asn1/a_digest.c
index d00d9e22..cbdeea6a 100644
--- a/main/openssl/crypto/asn1/a_digest.c
+++ b/main/openssl/crypto/asn1/a_digest.c
@@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
p=str;
i2d(data,&p);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
@@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
i=ASN1_item_i2d(asn,&str, it);
if (!str) return(0);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
diff --git a/main/openssl/crypto/asn1/a_int.c b/main/openssl/crypto/asn1/a_int.c
index 3348b876..297c45a9 100644
--- a/main/openssl/crypto/asn1/a_int.c
+++ b/main/openssl/crypto/asn1/a_int.c
@@ -116,7 +116,7 @@ int i2c_ASN1_INTEGER(ASN1_INTEGER *a, unsigned char **pp)
int pad=0,ret,i,neg;
unsigned char *p,*n,pb=0;
- if ((a == NULL) || (a->data == NULL)) return(0);
+ if (a == NULL) return(0);
neg=a->type & V_ASN1_NEG;
if (a->length == 0)
ret=1;
@@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a)
if (a->length > (int)sizeof(long))
{
- /* hmm... a bit ugly */
- return(0xffffffffL);
+ /* hmm... a bit ugly, return all ones */
+ return -1;
}
if (a->data == NULL)
return 0;
diff --git a/main/openssl/crypto/asn1/a_sign.c b/main/openssl/crypto/asn1/a_sign.c
index ff63bfc7..7b4a193d 100644
--- a/main/openssl/crypto/asn1/a_sign.c
+++ b/main/openssl/crypto/asn1/a_sign.c
@@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
p=buf_in;
i2d(data,&p);
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+ if (!EVP_SignInit_ex(&ctx,type, NULL)
+ || !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+ || !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
(unsigned int *)&outl,pkey))
{
outl=0;
@@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
const EVP_MD *type)
{
EVP_MD_CTX ctx;
+ EVP_MD_CTX_init(&ctx);
+ if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
+ {
+ EVP_MD_CTX_cleanup(&ctx);
+ return 0;
+ }
+ return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+ }
+
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+ {
+ const EVP_MD *type;
+ EVP_PKEY *pkey;
unsigned char *buf_in=NULL,*buf_out=NULL;
- int inl=0,outl=0,outll=0;
+ size_t inl=0,outl=0,outll=0;
int signid, paramtype;
+ int rv;
+
+ type = EVP_MD_CTX_md(ctx);
+ pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
- if (type == NULL)
+ if (!type || !pkey)
{
- int def_nid;
- if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
- type = EVP_get_digestbynid(def_nid);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+ return 0;
}
- if (type == NULL)
+ if (pkey->ameth->item_sign)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
- return 0;
+ rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+ signature);
+ if (rv == 1)
+ outl = signature->length;
+ /* Return value meanings:
+ * <=0: error.
+ * 1: method does everything.
+ * 2: carry on as normal.
+ * 3: ASN1 method sets algorithm identifiers: just sign.
+ */
+ if (rv <= 0)
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ if (rv <= 1)
+ goto err;
}
+ else
+ rv = 2;
- if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+ if (rv == 2)
{
- if (!pkey->ameth ||
- !OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
- pkey->ameth->pkey_id))
+ if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,
- ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
- return 0;
+ if (!pkey->ameth ||
+ !OBJ_find_sigid_by_algs(&signid,
+ EVP_MD_nid(type),
+ pkey->ameth->pkey_id))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+ ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ return 0;
+ }
}
- }
- else
- signid = type->pkey_type;
+ else
+ signid = type->pkey_type;
- if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
- paramtype = V_ASN1_NULL;
- else
- paramtype = V_ASN1_UNDEF;
+ if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+ paramtype = V_ASN1_NULL;
+ else
+ paramtype = V_ASN1_UNDEF;
- if (algor1)
- X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
- if (algor2)
- X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor1)
+ X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor2)
+ X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+ }
- EVP_MD_CTX_init(&ctx);
inl=ASN1_item_i2d(asn,&buf_in, it);
outll=outl=EVP_PKEY_size(pkey);
- buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+ buf_out=OPENSSL_malloc((unsigned int)outl);
if ((buf_in == NULL) || (buf_out == NULL))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
goto err;
}
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
- (unsigned int *)&outl,pkey))
+ if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+ || !EVP_DigestSignFinal(ctx, buf_out, &outl))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
goto err;
}
if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
err:
- EVP_MD_CTX_cleanup(&ctx);
+ EVP_MD_CTX_cleanup(ctx);
if (buf_in != NULL)
{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
if (buf_out != NULL)
diff --git a/main/openssl/crypto/asn1/a_strex.c b/main/openssl/crypto/asn1/a_strex.c
index 264ebf23..ead37ac3 100644
--- a/main/openssl/crypto/asn1/a_strex.c
+++ b/main/openssl/crypto/asn1/a_strex.c
@@ -567,6 +567,7 @@ int ASN1_STRING_to_UTF8(unsigned char **out, ASN1_STRING *in)
if(mbflag == -1) return -1;
mbflag |= MBSTRING_FLAG;
stmp.data = NULL;
+ stmp.length = 0;
ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING);
if(ret < 0) return ret;
*out = stmp.data;
diff --git a/main/openssl/crypto/asn1/a_verify.c b/main/openssl/crypto/asn1/a_verify.c
index cecdb13c..fc84cd3d 100644
--- a/main/openssl/crypto/asn1/a_verify.c
+++ b/main/openssl/crypto/asn1/a_verify.c
@@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
p=buf_in;
i2d(data,&p);
- EVP_VerifyInit_ex(&ctx,type, NULL);
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+ || !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
@@ -126,16 +131,21 @@ err:
#endif
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
- void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+ ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
{
EVP_MD_CTX ctx;
- const EVP_MD *type = NULL;
unsigned char *buf_in=NULL;
int ret= -1,inl;
int mdnid, pknid;
+ if (!pkey)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_PASSED_NULL_PARAMETER);
+ return -1;
+ }
+
EVP_MD_CTX_init(&ctx);
/* Convert signature OID into digest and public key OIDs */
@@ -144,25 +154,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
goto err;
}
- type=EVP_get_digestbynid(mdnid);
- if (type == NULL)
+ if (mdnid == NID_undef)
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
- goto err;
+ if (!pkey->ameth || !pkey->ameth->item_verify)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+ signature, pkey);
+ /* Return value of 2 means carry on, anything else means we
+	 * exit straight away: either a fatal error occurred or the
+	 * underlying verification routine handles all verification.
+ */
+ if (ret != 2)
+ goto err;
+ ret = -1;
}
-
- /* Check public key OID matches public key type */
- if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ else
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
- goto err;
- }
+ const EVP_MD *type;
+ type=EVP_get_digestbynid(mdnid);
+ if (type == NULL)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ /* Check public key OID matches public key type */
+ if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
- if (!EVP_VerifyInit_ex(&ctx,type, NULL))
- {
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
- ret=0;
- goto err;
}
inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +205,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
goto err;
}
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
- if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
- (unsigned int)signature->length,pkey) <= 0)
+ if (EVP_DigestVerifyFinal(&ctx,signature->data,
+ (size_t)signature->length) <= 0)
{
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
ret=0;
diff --git a/main/openssl/crypto/asn1/ameth_lib.c b/main/openssl/crypto/asn1/ameth_lib.c
index 5a581b90..a19e058f 100644
--- a/main/openssl/crypto/asn1/ameth_lib.c
+++ b/main/openssl/crypto/asn1/ameth_lib.c
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
/* Keep this sorted in type order !! */
static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
#ifndef OPENSSL_NO_EC
&eckey_asn1_meth,
#endif
- &hmac_asn1_meth
+ &hmac_asn1_meth,
+ &cmac_asn1_meth
};
typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
if (!ameth)
return NULL;
+ memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
ameth->pkey_id = id;
ameth->pkey_base_id = id;
ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
ameth->old_priv_encode = 0;
ameth->old_priv_decode = 0;
+ ameth->item_verify = 0;
+ ameth->item_sign = 0;
+
ameth->pkey_size = 0;
ameth->pkey_bits = 0;
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
dst->pkey_free = src->pkey_free;
dst->pkey_ctrl = src->pkey_ctrl;
+ dst->item_sign = src->item_sign;
+ dst->item_verify = src->item_verify;
+
}
void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
diff --git a/main/openssl/crypto/asn1/asn1.h b/main/openssl/crypto/asn1/asn1.h
index 59540e4e..220a0c8c 100644
--- a/main/openssl/crypto/asn1/asn1.h
+++ b/main/openssl/crypto/asn1/asn1.h
@@ -235,7 +235,7 @@ typedef struct asn1_object_st
*/
#define ASN1_STRING_FLAG_MSTRING 0x040
/* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
{
int length;
int type;
@@ -245,7 +245,7 @@ typedef struct asn1_string_st
* input data has a non-zero 'unused bits' value, it will be
* handled correctly */
long flags;
- } ASN1_STRING;
+ };
/* ASN1_ENCODING structure: this is used to save the received
* encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE)
* see asn1t.h
*/
typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct ASN1_TLC_st ASN1_TLC;
/* This is just an opaque pointer */
typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_ASN1_ITEM_I2D_FP 193
#define ASN1_F_ASN1_ITEM_PACK 198
#define ASN1_F_ASN1_ITEM_SIGN 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX 220
#define ASN1_F_ASN1_ITEM_UNPACK 199
#define ASN1_F_ASN1_ITEM_VERIFY 197
#define ASN1_F_ASN1_MBSTRING_NCOPY 122
@@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_PKCS5_PBE2_SET_IV 167
#define ASN1_F_PKCS5_PBE_SET 202
#define ASN1_F_PKCS5_PBE_SET0_ALGOR 215
+#define ASN1_F_PKCS5_PBKDF2_SET 219
#define ASN1_F_SMIME_READ_ASN1 212
#define ASN1_F_SMIME_TEXT 213
#define ASN1_F_X509_CINF_NEW 168
@@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_R_BOOLEAN_IS_WRONG_LENGTH 106
#define ASN1_R_BUFFER_TOO_SMALL 107
#define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED 217
#define ASN1_R_DATA_IS_WRONG 109
#define ASN1_R_DECODE_ERROR 110
#define ASN1_R_DECODING_ERROR 111
diff --git a/main/openssl/crypto/asn1/asn1_err.c b/main/openssl/crypto/asn1/asn1_err.c
index 6e04d08f..1a30bf11 100644
--- a/main/openssl/crypto/asn1/asn1_err.c
+++ b/main/openssl/crypto/asn1/asn1_err.c
@@ -1,6 +1,6 @@
/* crypto/asn1/asn1_err.c */
/* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"},
{ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"},
{ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"},
{ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"},
{ERR_FUNC(ASN1_F_X509_CINF_NEW), "X509_CINF_NEW"},
@@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
{ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
{ERR_REASON(ASN1_R_BUFFER_TOO_SMALL) ,"buffer too small"},
{ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
{ERR_REASON(ASN1_R_DATA_IS_WRONG) ,"data is wrong"},
{ERR_REASON(ASN1_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(ASN1_R_DECODING_ERROR) ,"decoding error"},
diff --git a/main/openssl/crypto/asn1/asn1_locl.h b/main/openssl/crypto/asn1/asn1_locl.h
index 5aa65e28..9fcf0d95 100644
--- a/main/openssl/crypto/asn1/asn1_locl.h
+++ b/main/openssl/crypto/asn1/asn1_locl.h
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st
int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
ASN1_PCTX *pctx);
+ int (*sig_print)(BIO *out,
+ const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx);
+
void (*pkey_free)(EVP_PKEY *pkey);
int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st
int (*old_priv_decode)(EVP_PKEY *pkey,
const unsigned char **pder, int derlen);
int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+ /* Custom ASN1 signature verification */
+ int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *a, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey);
+ int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig);
} /* EVP_PKEY_ASN1_METHOD */;
diff --git a/main/openssl/crypto/asn1/asn_mime.c b/main/openssl/crypto/asn1/asn_mime.c
index c1d1b122..54a704a9 100644
--- a/main/openssl/crypto/asn1/asn_mime.c
+++ b/main/openssl/crypto/asn1/asn_mime.c
@@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
BIO *tmpbio;
const ASN1_AUX *aux = it->funcs;
ASN1_STREAM_ARG sarg;
+ int rv = 1;
- if (!(flags & SMIME_DETACHED))
+	/* If the data is not detached, or we are resigning, then the
+	 * output BIO is already set up to finalise when it is written
+	 * through.
+ */
+ if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST))
{
SMIME_crlf_copy(data, out, flags);
return 1;
@@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
/* Finalize structure */
if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
- return 0;
+ rv = 0;
/* Now remove any digests prepended to the BIO */
@@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
sarg.ndef_bio = tmpbio;
}
- return 1;
+ return rv;
}
@@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
if(strcmp(hdr->value, "application/x-pkcs7-signature") &&
strcmp(hdr->value, "application/pkcs7-signature")) {
- sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE);
ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
sk_BIO_pop_free(parts, BIO_vfree);
return NULL;
}
@@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(name) {
if(!(tmpname = BUF_strdup(name))) return NULL;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(value) {
if(!(tmpval = BUF_strdup(value))) return NULL;
for(p = tmpval ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
tmpname = BUF_strdup(name);
if(!tmpname) return 0;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
static int mime_hdr_cmp(const MIME_HEADER * const *a,
const MIME_HEADER * const *b)
{
+ if (!(*a)->name || !(*b)->name)
+ return !!(*a)->name - !!(*b)->name;
+
return(strcmp((*a)->name, (*b)->name));
}
static int mime_param_cmp(const MIME_PARAM * const *a,
const MIME_PARAM * const *b)
{
+ if (!(*a)->param_name || !(*b)->param_name)
+ return !!(*a)->param_name - !!(*b)->param_name;
return(strcmp((*a)->param_name, (*b)->param_name));
}
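The !!x - !!y guard added to both comparators sorts entries with NULL names
ahead of named ones and keeps strcmp() away from NULL pointers; the idiom
in isolation (illustrative):

    static int null_first(const void *x, const void *y)
    {
        return !!x - !!y;  /* -1 if only x is NULL, 1 if only y is */
    }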
diff --git a/main/openssl/crypto/asn1/n_pkey.c b/main/openssl/crypto/asn1/n_pkey.c
index e7d04390..e2517399 100644
--- a/main/openssl/crypto/asn1/n_pkey.c
+++ b/main/openssl/crypto/asn1/n_pkey.c
@@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
unsigned char buf[256],*zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
if (a == NULL) return(0);
@@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
i = strlen((char *)buf);
/* If the key is used for SGC the algorithm is modified a little. */
if(sgckey) {
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
/* Encrypt private key in place */
zz = enckey->enckey->digest->data;
- EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
- EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
- EVP_EncryptFinal_ex(&ctx,zz + i,&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+ goto err;
+ if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+ goto err;
+ if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+ goto err;
ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_ENCRYPTED_PKEY_free(enckey);
NETSCAPE_PKEY_free(pkey);
return(ret);
@@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
const unsigned char *zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
i=cb((char *)buf,256,"Enter Private Key password:",0);
if (i != 0)
@@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
i = strlen((char *)buf);
if(sgckey){
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
- EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
- EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
- EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+ goto err;
os->length=i+j;
zz=os->data;
@@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
goto err;
}
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_PKEY_free(pkey);
return(ret);
}
diff --git a/main/openssl/crypto/asn1/p5_pbev2.c b/main/openssl/crypto/asn1/p5_pbev2.c
index cb49b665..4ea68303 100644
--- a/main/openssl/crypto/asn1/p5_pbev2.c
+++ b/main/openssl/crypto/asn1/p5_pbev2.c
@@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *aiv, int prf_nid)
{
X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
- int alg_nid;
+ int alg_nid, keylen;
EVP_CIPHER_CTX ctx;
unsigned char iv[EVP_MAX_IV_LENGTH];
- PBKDF2PARAM *kdf = NULL;
PBE2PARAM *pbe2 = NULL;
- ASN1_OCTET_STRING *osalt = NULL;
ASN1_OBJECT *obj;
alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
EVP_CIPHER_CTX_init(&ctx);
/* Dummy cipherinit to just setup the IV, and PRF */
- EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+ if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+ goto err;
if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
}
EVP_CIPHER_CTX_cleanup(&ctx);
- if(!(kdf = PBKDF2PARAM_new())) goto merr;
- if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
- if (!saltlen) saltlen = PKCS5_SALT_LEN;
- if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
- osalt->length = saltlen;
- if (salt) memcpy (osalt->data, salt, saltlen);
- else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
- if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
- if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
- /* Now include salt in kdf structure */
- kdf->salt->value.octet_string = osalt;
- kdf->salt->type = V_ASN1_OCTET_STRING;
- osalt = NULL;
-
/* If its RC2 then we'd better setup the key length */
- if(alg_nid == NID_rc2_cbc) {
- if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
- if(!ASN1_INTEGER_set (kdf->keylength,
- EVP_CIPHER_key_length(cipher))) goto merr;
- }
-
- /* prf can stay NULL if we are using hmacWithSHA1 */
- if (prf_nid != NID_hmacWithSHA1)
- {
- kdf->prf = X509_ALGOR_new();
- if (!kdf->prf)
- goto merr;
- X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
- V_ASN1_NULL, NULL);
- }
-
- /* Now setup the PBE2PARAM keyfunc structure */
+ if(alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+ else
+ keylen = -1;
- pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+ /* Setup keyfunc */
- /* Encode PBKDF2PARAM into parameter of pbe2 */
+ X509_ALGOR_free(pbe2->keyfunc);
- if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
+ pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
- if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
- &pbe2->keyfunc->parameter->value.sequence)) goto merr;
- pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
- PBKDF2PARAM_free(kdf);
- kdf = NULL;
+ if (!pbe2->keyfunc)
+ goto merr;
/* Now set up top level AlgorithmIdentifier */
@@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
err:
PBE2PARAM_free(pbe2);
/* Note 'scheme' is freed as part of pbe2 */
- M_ASN1_OCTET_STRING_free(osalt);
- PBKDF2PARAM_free(kdf);
X509_ALGOR_free(kalg);
X509_ALGOR_free(ret);
@@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
{
return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen)
+ {
+ X509_ALGOR *keyfunc = NULL;
+ PBKDF2PARAM *kdf = NULL;
+ ASN1_OCTET_STRING *osalt = NULL;
+
+ if(!(kdf = PBKDF2PARAM_new()))
+ goto merr;
+ if(!(osalt = M_ASN1_OCTET_STRING_new()))
+ goto merr;
+
+ kdf->salt->value.octet_string = osalt;
+ kdf->salt->type = V_ASN1_OCTET_STRING;
+
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+ if (!(osalt->data = OPENSSL_malloc (saltlen)))
+ goto merr;
+
+ osalt->length = saltlen;
+
+ if (salt)
+ memcpy (osalt->data, salt, saltlen);
+ else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+ goto merr;
+
+ if(iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+
+ if(!ASN1_INTEGER_set(kdf->iter, iter))
+ goto merr;
+
+ /* If we have a key length, set it up */
+
+ if(keylen > 0)
+ {
+ if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+ goto merr;
+ if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+ goto merr;
+ }
+
+ /* prf can stay NULL if we are using hmacWithSHA1 */
+ if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+ {
+ kdf->prf = X509_ALGOR_new();
+ if (!kdf->prf)
+ goto merr;
+ X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+ V_ASN1_NULL, NULL);
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (!keyfunc)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+ /* Encode PBKDF2PARAM into parameter of pbe2 */
+
+ if(!(keyfunc->parameter = ASN1_TYPE_new()))
+ goto merr;
+
+ if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+ &keyfunc->parameter->value.sequence))
+ goto merr;
+ keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+ PBKDF2PARAM_free(kdf);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+ PBKDF2PARAM_free(kdf);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+ }
+
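(A minimal usage sketch for the new helper, with illustrative values: 2048 iterations, a random 8-byte salt (salt == NULL, saltlen == 0 falls back to PKCS5_SALT_LEN), the default hmacWithSHA1 PRF (prf_nid <= 0) and no explicit key length (keylen < 0):

    X509_ALGOR *kalg = PKCS5_pbkdf2_set(2048, NULL, 0, -1, -1);
    if (kalg == NULL)
        { /* malloc failure already put ASN1err on the error stack */ }
    /* ... embed kalg in a PBES2 structure, or: */
    X509_ALGOR_free(kalg);
)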
diff --git a/main/openssl/crypto/asn1/t_crl.c b/main/openssl/crypto/asn1/t_crl.c
index ee5a687c..c6116920 100644
--- a/main/openssl/crypto/asn1/t_crl.c
+++ b/main/openssl/crypto/asn1/t_crl.c
@@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x)
l = X509_CRL_get_version(x);
BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
i = OBJ_obj2nid(x->sig_alg->algorithm);
- BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
- (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+ X509_signature_print(out, x->sig_alg, NULL);
p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
BIO_printf(out,"%8sIssuer: %s\n","",p);
OPENSSL_free(p);
diff --git a/main/openssl/crypto/asn1/t_x509.c b/main/openssl/crypto/asn1/t_x509.c
index e061f2ff..edbb39a0 100644
--- a/main/openssl/crypto/asn1/t_x509.c
+++ b/main/openssl/crypto/asn1/t_x509.c
@@ -72,6 +72,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/x509v3.h>
+#include "asn1_locl.h"
#ifndef OPENSSL_NO_FP_API
int X509_print_fp(FILE *fp, X509 *x)
@@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if (BIO_write(bp," Serial Number:",22) <= 0) goto err;
bs=X509_get_serialNumber(x);
- if (bs->length <= 4)
+ if (bs->length <= (int)sizeof(long))
{
l=ASN1_INTEGER_get(bs);
- if (l < 0)
+ if (bs->type == V_ASN1_NEG_INTEGER)
{
l= -l;
neg="-";
@@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if(!(cflag & X509_FLAG_NO_SIGNAME))
{
+ if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+ goto err;
+#if 0
if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0)
goto err;
if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
goto err;
if (BIO_puts(bp, "\n") <= 0)
goto err;
+#endif
}
if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x)
goto err;
i2d_X509_NAME(x->cert_info->subject, &dertmp);
- EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x)
if (BIO_printf(bp,"\n Public key OCSP hash: ") <= 0)
goto err;
- EVP_Digest(x->cert_info->key->public_key->data,
- x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->cert_info->key->public_key->data,
+ x->cert_info->key->public_key->length,
+ SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@ err:
return(0);
}
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
{
- unsigned char *s;
+ const unsigned char *s;
int i, n;
- if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
- if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
n=sig->length;
s=sig->data;
for (i=0; i<n; i++)
{
if ((i%18) == 0)
- if (BIO_write(bp,"\n ",9) <= 0) return 0;
+ {
+ if (BIO_write(bp,"\n",1) <= 0) return 0;
+ if (BIO_indent(bp, indent, indent) <= 0) return 0;
+ }
if (BIO_printf(bp,"%02x%s",s[i],
((i+1) == n)?"":":") <= 0) return 0;
}
if (BIO_write(bp,"\n",1) != 1) return 0;
+
+ return 1;
+}
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+{
+ int sig_nid;
+ if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
+ if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+ sig_nid = OBJ_obj2nid(sigalg->algorithm);
+ if (sig_nid != NID_undef)
+ {
+ int pkey_nid, dig_nid;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+ {
+ ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+ if (ameth && ameth->sig_print)
+ return ameth->sig_print(bp, sigalg, sig, 9, 0);
+ }
+ }
+ if (sig)
+ return X509_signature_dump(bp, sig, 9);
+ else if (BIO_puts(bp, "\n") <= 0)
+ return 0;
return 1;
}
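(With this refactor, CRL and certificate printing share one entry point; a sketch of calling it directly, where out is an assumed BIO* and x an assumed X509*:

    /* Prints "Signature Algorithm: <name>" and, if the key's ASN.1
     * method supplies sig_print, an algorithm-specific rendering;
     * otherwise a plain hex dump of the signature bits. */
    if (X509_signature_print(out, x->sig_alg, x->signature) <= 0)
        { /* handle write error */ }
)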
diff --git a/main/openssl/crypto/asn1/tasn_prn.c b/main/openssl/crypto/asn1/tasn_prn.c
index 45369801..542a091a 100644
--- a/main/openssl/crypto/asn1/tasn_prn.c
+++ b/main/openssl/crypto/asn1/tasn_prn.c
@@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent,
return 1;
}
-static int asn1_print_boolean_ctx(BIO *out, const int bool,
+static int asn1_print_boolean_ctx(BIO *out, int boolval,
const ASN1_PCTX *pctx)
{
const char *str;
- switch (bool)
+ switch (boolval)
{
case -1:
str = "BOOL ABSENT";
@@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
{
case V_ASN1_BOOLEAN:
{
- int bool = *(int *)fld;
- if (bool == -1)
- bool = it->size;
- ret = asn1_print_boolean_ctx(out, bool, pctx);
+ int boolval = *(int *)fld;
+ if (boolval == -1)
+ boolval = it->size;
+ ret = asn1_print_boolean_ctx(out, boolval, pctx);
}
break;
diff --git a/main/openssl/crypto/asn1/x_algor.c b/main/openssl/crypto/asn1/x_algor.c
index 99e53429..274e456c 100644
--- a/main/openssl/crypto/asn1/x_algor.c
+++ b/main/openssl/crypto/asn1/x_algor.c
@@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
}
}
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
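(A sketch of the new helper, where alg is an assumed, freshly allocated X509_ALGOR*:

    /* SHA-256 keeps an explicit NULL parameter; digests flagged
     * EVP_MD_FLAG_DIGALGID_ABSENT have the parameter omitted. */
    X509_ALGOR_set_md(alg, EVP_sha256());
)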
diff --git a/main/openssl/crypto/asn1/x_name.c b/main/openssl/crypto/asn1/x_name.c
index 49be08b4..d7c23186 100644
--- a/main/openssl/crypto/asn1/x_name.c
+++ b/main/openssl/crypto/asn1/x_name.c
@@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in)
/* If type not in bitmask just copy string across */
if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON))
{
- out->type = in->type;
- if (!ASN1_STRING_set(out, in->data, in->length))
+ if (!ASN1_STRING_copy(out, in))
return 0;
return 1;
}
diff --git a/main/openssl/crypto/asn1/x_pubkey.c b/main/openssl/crypto/asn1/x_pubkey.c
index d42b6a2c..b649e1fc 100644
--- a/main/openssl/crypto/asn1/x_pubkey.c
+++ b/main/openssl/crypto/asn1/x_pubkey.c
@@ -171,7 +171,19 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key)
goto error;
}
- key->pkey = ret;
+ /* Check to see if another thread set key->pkey first */
+ CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
+ if (key->pkey)
+ {
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ EVP_PKEY_free(ret);
+ ret = key->pkey;
+ }
+ else
+ {
+ key->pkey = ret;
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ }
CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY);
return ret;
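(The hunk above is the classic check-lock-recheck publication pattern under OpenSSL's static lock table. The same shape in isolation, as a sketch that reuses CRYPTO_LOCK_EVP_PKEY purely for illustration; a real caller would pick the lock matching its data:

    #include <openssl/crypto.h>
    #include <openssl/buffer.h>

    static char *cached;

    char *get_shared(void)
        {
        char *tmp = BUF_strdup("shared");   /* build outside the lock */
        CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
        if (cached != NULL)
            {                               /* another thread won the race */
            CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
            OPENSSL_free(tmp);              /* discard our copy */
            }
        else
            {
            cached = tmp;                   /* publish under the lock */
            CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
            }
        return cached;
        }
)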
diff --git a/main/openssl/crypto/bf/asm/bf-586.S b/main/openssl/crypto/bf/asm/bf-586.S
new file mode 100644
index 00000000..aa718d40
--- /dev/null
+++ b/main/openssl/crypto/bf/asm/bf-586.S
@@ -0,0 +1,896 @@
+.file "bf-586.s"
+.text
+.globl BF_encrypt
+.type BF_encrypt,@function
+.align 16
+BF_encrypt:
+.L_BF_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl (%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
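+# The 16 fully unrolled Blowfish rounds below each XOR one half with a
+# P-array word (offsets 0..68 into the BF_KEY at %ebp) and compute the
+# F-function as four S-box loads at offsets 72, 1096, 2120 and 3144.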
+ movl 4(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 8(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 12(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 16(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 20(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 24(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 28(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 32(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 36(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 40(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 44(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 48(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 52(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 56(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 60(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 64(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl 68(%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_encrypt,.-.L_BF_encrypt_begin
+.globl BF_decrypt
+.type BF_decrypt,@function
+.align 16
+BF_decrypt:
+.L_BF_decrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl 68(%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 64(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 60(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 56(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 52(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 48(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 44(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 40(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 36(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 32(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 28(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 24(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 20(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 16(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 12(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 8(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 4(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl (%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_decrypt,.-.L_BF_decrypt_begin
+.globl BF_cbc_encrypt
+.type BF_cbc_encrypt,@function
+.align 16
+BF_cbc_encrypt:
+.L_BF_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ movl 48(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L000decrypt
+ andl $4294967288,%ebp
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ jz .L001encrypt_finish
+.L002encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L002encrypt_loop
+.L001encrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ call .L004PIC_point
+.L004PIC_point:
+ popl %edx
+ leal .L005cbc_enc_jmp_table-.L004PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L006ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L007ej6:
+ movb 5(%esi),%dh
+.L008ej5:
+ movb 4(%esi),%dl
+.L009ej4:
+ movl (%esi),%ecx
+ jmp .L010ejend
+.L011ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L012ej2:
+ movb 1(%esi),%ch
+.L013ej1:
+ movb (%esi),%cl
+.L010ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L003finish
+.L000decrypt:
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L014decrypt_finish
+.L015decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L015decrypt_loop
+.L014decrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L016dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L017dj6:
+ movb %dh,5(%edi)
+.L018dj5:
+ movb %dl,4(%edi)
+.L019dj4:
+ movl %ecx,(%edi)
+ jmp .L020djend
+.L021dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L022dj2:
+ movb %ch,1(%esi)
+.L023dj1:
+ movb %cl,(%esi)
+.L020djend:
+ jmp .L003finish
+.L003finish:
+ movl 60(%esp),%ecx
+ addl $24,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L005cbc_enc_jmp_table:
+.long 0
+.long .L013ej1-.L004PIC_point
+.long .L012ej2-.L004PIC_point
+.long .L011ej3-.L004PIC_point
+.long .L009ej4-.L004PIC_point
+.long .L008ej5-.L004PIC_point
+.long .L007ej6-.L004PIC_point
+.long .L006ej7-.L004PIC_point
+.align 64
+.size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin
diff --git a/main/openssl/crypto/bf/bf_skey.c b/main/openssl/crypto/bf/bf_skey.c
index 3673cdee..3b0bca41 100644
--- a/main/openssl/crypto/bf/bf_skey.c
+++ b/main/openssl/crypto/bf/bf_skey.c
@@ -58,11 +58,19 @@
#include <stdio.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/blowfish.h>
#include "bf_locl.h"
#include "bf_pi.h"
void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(BLOWFISH);
+ private_BF_set_key(key, len, data);
+ }
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
{
int i;
BF_LONG *p,ri,in[2];
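(This construct is easy to misread: when OPENSSL_FIPS is defined, the preprocessor expands the single definition into a thin public wrapper plus the real key schedule under the private_ name, roughly:

    void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
        {
        fips_cipher_abort(BLOWFISH);  /* reject non-approved cipher in FIPS mode */
        private_BF_set_key(key, len, data);
        }
    void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
        {
        /* ... original key-schedule body ... */
        }

Without OPENSSL_FIPS the #ifdef block vanishes and BF_set_key keeps its original body.)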
diff --git a/main/openssl/crypto/bf/blowfish.h b/main/openssl/crypto/bf/blowfish.h
index b97e76f9..4b6c8920 100644
--- a/main/openssl/crypto/bf/blowfish.h
+++ b/main/openssl/crypto/bf/blowfish.h
@@ -104,7 +104,9 @@ typedef struct bf_key_st
BF_LONG S[4*256];
} BF_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
void BF_encrypt(BF_LONG *data,const BF_KEY *key);
diff --git a/main/openssl/crypto/bio/b_sock.c b/main/openssl/crypto/bio/b_sock.c
index d47310d6..470b1a00 100644
--- a/main/openssl/crypto/bio/b_sock.c
+++ b/main/openssl/crypto/bio/b_sock.c
@@ -629,7 +629,8 @@ int BIO_get_accept_socket(char *host, int bind_mode)
struct sockaddr_in6 sa_in6;
#endif
} server,client;
- int s=INVALID_SOCKET,cs,addrlen;
+ int s=INVALID_SOCKET,cs;
+ socklen_t addrlen;
unsigned char ip[4];
unsigned short port;
char *str=NULL,*e;
@@ -704,10 +705,10 @@ int BIO_get_accept_socket(char *host, int bind_mode)
if ((*p_getaddrinfo.f)(h,p,&hint,&res)) break;
- addrlen = res->ai_addrlen<=sizeof(server) ?
+ addrlen = res->ai_addrlen <= (socklen_t)sizeof(server) ?
res->ai_addrlen :
- sizeof(server);
- memcpy(&server, res->ai_addr, addrlen);
+ (socklen_t)sizeof(server);
+ memcpy(&server, res->ai_addr, (size_t)addrlen);
(*p_freeaddrinfo.f)(res);
goto again;
@@ -719,7 +720,7 @@ int BIO_get_accept_socket(char *host, int bind_mode)
memset((char *)&server,0,sizeof(server));
server.sa_in.sin_family=AF_INET;
server.sa_in.sin_port=htons(port);
- addrlen = sizeof(server.sa_in);
+ addrlen = (socklen_t)sizeof(server.sa_in);
if (h == NULL || strcmp(h,"*") == 0)
server.sa_in.sin_addr.s_addr=INADDR_ANY;
@@ -960,7 +961,6 @@ int BIO_set_tcp_ndelay(int s, int on)
#endif
return(ret == 0);
}
-#endif
int BIO_socket_nbio(int s, int mode)
{
@@ -973,3 +973,4 @@ int BIO_socket_nbio(int s, int mode)
#endif
return(ret == 0);
}
+#endif
diff --git a/main/openssl/crypto/bio/bf_buff.c b/main/openssl/crypto/bio/bf_buff.c
index c1fd75aa..4b5a132d 100644
--- a/main/openssl/crypto/bio/bf_buff.c
+++ b/main/openssl/crypto/bio/bf_buff.c
@@ -209,7 +209,7 @@ start:
/* add to buffer and return */
if (i >= inl)
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,inl);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,inl);
ctx->obuf_len+=inl;
return(num+inl);
}
@@ -219,7 +219,7 @@ start:
{
if (i > 0) /* lets fill it up if we can */
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,i);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,i);
in+=i;
inl-=i;
num+=i;
@@ -294,9 +294,9 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_C_GET_BUFF_NUM_LINES:
ret=0;
p1=ctx->ibuf;
- for (i=ctx->ibuf_off; i<ctx->ibuf_len; i++)
+ for (i=0; i<ctx->ibuf_len; i++)
{
- if (p1[i] == '\n') ret++;
+ if (p1[ctx->ibuf_off + i] == '\n') ret++;
}
break;
case BIO_CTRL_WPENDING:
@@ -399,17 +399,18 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
for (;;)
{
BIO_clear_retry_flags(b);
- if (ctx->obuf_len > ctx->obuf_off)
+ if (ctx->obuf_len > 0)
{
r=BIO_write(b->next_bio,
&(ctx->obuf[ctx->obuf_off]),
- ctx->obuf_len-ctx->obuf_off);
+ ctx->obuf_len);
#if 0
-fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len-ctx->obuf_off,r);
+fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len,r);
#endif
BIO_copy_next_retry(b);
if (r <= 0) return((long)r);
ctx->obuf_off+=r;
+ ctx->obuf_len-=r;
}
else
{
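(With this change the output buffer follows the layout documented in the bio.h hunk below: obuf_off counts consumed bytes and obuf_len counts bytes still pending, so a partial flush of r bytes reduces to:

    ctx->obuf_off += r;  /* more of the buffer has been consumed */
    ctx->obuf_len -= r;  /* less of it remains pending */
    /* pending data always lives at obuf[obuf_off .. obuf_off+obuf_len) */
)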
diff --git a/main/openssl/crypto/bio/bio.h b/main/openssl/crypto/bio/bio.h
index 152802fb..05699ab2 100644
--- a/main/openssl/crypto/bio/bio.h
+++ b/main/openssl/crypto/bio/bio.h
@@ -68,6 +68,14 @@
#include <openssl/crypto.h>
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -95,6 +103,9 @@ extern "C" {
#define BIO_TYPE_BIO (19|0x0400) /* (half a) BIO pair */
#define BIO_TYPE_LINEBUFFER (20|0x0200) /* filter */
#define BIO_TYPE_DGRAM (21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP (24|0x0400|0x0100)
+#endif
#define BIO_TYPE_ASN1 (22|0x0200) /* filter */
#define BIO_TYPE_COMP (23|0x0200) /* filter */
@@ -146,6 +157,7 @@ extern "C" {
/* #endif */
#define BIO_CTRL_DGRAM_QUERY_MTU 40 /* ask kernel for current MTU */
+#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU 47
#define BIO_CTRL_DGRAM_GET_MTU 41 /* get cached value for MTU */
#define BIO_CTRL_DGRAM_SET_MTU 42 /* set cached value for
* MTU. want to use this
@@ -161,7 +173,22 @@ extern "C" {
#define BIO_CTRL_DGRAM_SET_PEER 44 /* Destination for the data */
#define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45 /* Next DTLS handshake timeout to
- * adjust socket timeouts */
+ * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE 50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY 51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY 52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD 53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO 60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO 61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO 62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO 63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO 64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO 65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN 70
+#endif
/* modifiers */
#define BIO_FP_READ 0x02
@@ -306,6 +333,15 @@ DECLARE_STACK_OF(BIO)
typedef struct bio_f_buffer_ctx_struct
{
+ /* Buffers are setup like this:
+ *
+ * <---------------------- size ----------------------->
+ * +---------------------------------------------------+
+ * | consumed | remaining | free space |
+ * +---------------------------------------------------+
+ * <-- off --><------- len ------->
+ */
+
/* BIO *bio; */ /* this is now in the BIO struct */
int ibuf_size; /* how big is the input buffer */
int obuf_size; /* how big is the output buffer */
@@ -322,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct
/* Prefix and suffix callback in ASN1 BIO */
typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+ {
+ uint16_t snd_sid;
+ uint16_t snd_flags;
+ uint32_t snd_ppid;
+ uint32_t snd_context;
+ };
+
+struct bio_dgram_sctp_rcvinfo
+ {
+ uint16_t rcv_sid;
+ uint16_t rcv_ssn;
+ uint16_t rcv_flags;
+ uint32_t rcv_ppid;
+ uint32_t rcv_tsn;
+ uint32_t rcv_cumtsn;
+ uint32_t rcv_context;
+ };
+
+struct bio_dgram_sctp_prinfo
+ {
+ uint16_t pr_policy;
+ uint32_t pr_value;
+ };
+#endif
+
/* connect BIO stuff */
#define BIO_CONN_S_BEFORE 1
#define BIO_CONN_S_GET_IP 2
@@ -619,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void);
BIO_METHOD *BIO_f_nbio_test(void);
#ifndef OPENSSL_NO_DGRAM
BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
#endif
/* BIO_METHOD *BIO_f_ber(void); */
@@ -661,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on);
BIO *BIO_new_socket(int sock, int close_flag);
BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
BIO *BIO_new_fd(int fd, int close_flag);
BIO *BIO_new_connect(char *host_port);
BIO *BIO_new_accept(char *host_port);
@@ -725,6 +801,7 @@ void ERR_load_BIO_strings(void);
#define BIO_F_BUFFER_CTRL 114
#define BIO_F_CONN_CTRL 127
#define BIO_F_CONN_STATE 115
+#define BIO_F_DGRAM_SCTP_READ 132
#define BIO_F_FILE_CTRL 116
#define BIO_F_FILE_READ 130
#define BIO_F_LINEBUFFER_CTRL 129
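(A minimal sketch of bringing up the new SCTP BIO, where fd is an assumed connected one-to-one SCTP socket and notify_cb/app_ctx are hypothetical:

    BIO *bio = BIO_new_dgram_sctp(fd, BIO_NOCLOSE);
    if (bio == NULL)
        { /* handle error */ }
    if (BIO_dgram_is_sctp(bio))
        /* deliver SCTP notifications to the application */
        BIO_dgram_sctp_notification_cb(bio, notify_cb, app_ctx);
)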
diff --git a/main/openssl/crypto/bio/bio_err.c b/main/openssl/crypto/bio/bio_err.c
index a224edd5..0dbfbd80 100644
--- a/main/openssl/crypto/bio/bio_err.c
+++ b/main/openssl/crypto/bio/bio_err.c
@@ -1,6 +1,6 @@
/* crypto/bio/bio_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]=
{ERR_FUNC(BIO_F_BUFFER_CTRL), "BUFFER_CTRL"},
{ERR_FUNC(BIO_F_CONN_CTRL), "CONN_CTRL"},
{ERR_FUNC(BIO_F_CONN_STATE), "CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "DGRAM_SCTP_READ"},
{ERR_FUNC(BIO_F_FILE_CTRL), "FILE_CTRL"},
{ERR_FUNC(BIO_F_FILE_READ), "FILE_READ"},
{ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "LINEBUFFER_CTRL"},
diff --git a/main/openssl/crypto/bio/bio_lib.c b/main/openssl/crypto/bio/bio_lib.c
index e12bc3a2..9c9646af 100644
--- a/main/openssl/crypto/bio/bio_lib.c
+++ b/main/openssl/crypto/bio/bio_lib.c
@@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio)
BIO *BIO_dup_chain(BIO *in)
{
- BIO *ret=NULL,*eoc=NULL,*bio,*new;
+ BIO *ret=NULL,*eoc=NULL,*bio,*new_bio;
for (bio=in; bio != NULL; bio=bio->next_bio)
{
- if ((new=BIO_new(bio->method)) == NULL) goto err;
- new->callback=bio->callback;
- new->cb_arg=bio->cb_arg;
- new->init=bio->init;
- new->shutdown=bio->shutdown;
- new->flags=bio->flags;
+ if ((new_bio=BIO_new(bio->method)) == NULL) goto err;
+ new_bio->callback=bio->callback;
+ new_bio->cb_arg=bio->cb_arg;
+ new_bio->init=bio->init;
+ new_bio->shutdown=bio->shutdown;
+ new_bio->flags=bio->flags;
/* This will let SSL_s_sock() work with stdin/stdout */
- new->num=bio->num;
+ new_bio->num=bio->num;
- if (!BIO_dup_state(bio,(char *)new))
+ if (!BIO_dup_state(bio,(char *)new_bio))
{
- BIO_free(new);
+ BIO_free(new_bio);
goto err;
}
/* copy app data */
- if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data,
+ if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
&bio->ex_data))
goto err;
if (ret == NULL)
{
- eoc=new;
+ eoc=new_bio;
ret=eoc;
}
else
{
- BIO_push(eoc,new);
- eoc=new;
+ BIO_push(eoc,new_bio);
+ eoc=new_bio;
}
}
return(ret);
diff --git a/main/openssl/crypto/bio/bss_bio.c b/main/openssl/crypto/bio/bss_bio.c
index 76bd48e7..52ef0ebc 100644
--- a/main/openssl/crypto/bio/bss_bio.c
+++ b/main/openssl/crypto/bio/bss_bio.c
@@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_)
*/
/* WARNING: The non-copying interface is largely untested as of yet
* and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num;
+ ossl_ssize_t num;
BIO_clear_retry_flags(bio);
@@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num, available;
+ ossl_ssize_t num, available;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
available = bio_nread0(bio, buf);
if (num > available)
@@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_)
* (example usage: bio_nwrite0(), write to buffer, bio_nwrite()
* or just bio_nwrite(), write to buffer)
*/
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
{
struct bio_bio_st *b;
size_t num;
@@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b;
- ssize_t num, space;
+ ossl_ssize_t num, space;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
space = bio_nwrite0(bio, buf);
if (num > space)
diff --git a/main/openssl/crypto/bio/bss_dgram.c b/main/openssl/crypto/bio/bss_dgram.c
index 71ebe987..54c012c4 100644
--- a/main/openssl/crypto/bio/bss_dgram.c
+++ b/main/openssl/crypto/bio/bss_dgram.c
@@ -70,10 +70,27 @@
#include <sys/timeb.h>
#endif
-#ifdef OPENSSL_SYS_LINUX
+#ifndef OPENSSL_NO_SCTP
+#include <netinet/sctp.h>
+#include <fcntl.h>
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
+#if defined(OPENSSL_SYS_LINUX) && !defined(IP_MTU)
#define IP_MTU 14 /* linux is lame */
#endif
+#if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
+/* Standard definition causes type-punning problems. */
+#undef IN6_IS_ADDR_V4MAPPED
+#define s6_addr32 __u6_addr.__u6_addr32
+#define IN6_IS_ADDR_V4MAPPED(a) \
+ (((a)->s6_addr32[0] == 0) && \
+ ((a)->s6_addr32[1] == 0) && \
+ ((a)->s6_addr32[2] == htonl(0x0000ffff)))
+#endif
+
#ifdef WATT32
#define sock_write SockWrite /* Watt-32 uses same names */
#define sock_read SockRead
@@ -88,6 +105,18 @@ static int dgram_new(BIO *h);
static int dgram_free(BIO *data);
static int dgram_clear(BIO *bio);
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
static int BIO_dgram_should_retry(int s);
static void get_current_time(struct timeval *t);
@@ -106,6 +135,22 @@ static BIO_METHOD methods_dgramp=
NULL,
};
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+ {
+ BIO_TYPE_DGRAM_SCTP,
+ "datagram sctp socket",
+ dgram_sctp_write,
+ dgram_sctp_read,
+ dgram_sctp_puts,
+ NULL, /* dgram_gets, */
+ dgram_sctp_ctrl,
+ dgram_sctp_new,
+ dgram_sctp_free,
+ NULL,
+ };
+#endif
+
typedef struct bio_dgram_data_st
{
union {
@@ -122,6 +167,40 @@ typedef struct bio_dgram_data_st
struct timeval socket_timeout;
} bio_dgram_data;
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+ {
+ BIO *bio;
+ char *data;
+ int length;
+ } bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+ {
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+ struct sockaddr_in6 sa_in6;
+#endif
+ } peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct bio_dgram_sctp_sndinfo sndinfo;
+ struct bio_dgram_sctp_rcvinfo rcvinfo;
+ struct bio_dgram_sctp_prinfo prinfo;
+ void (*handle_notifications)(BIO *bio, void *context, void *buf);
+ void* notification_context;
+ int in_handshake;
+ int ccs_rcvd;
+ int ccs_sent;
+ int save_shutdown;
+ int peer_auth_tested;
+ bio_dgram_sctp_save_message saved_message;
+ } bio_dgram_sctp_data;
+#endif
+
BIO_METHOD *BIO_s_datagram(void)
{
return(&methods_dgramp);
@@ -186,7 +265,7 @@ static void dgram_adjust_rcv_timeout(BIO *b)
{
#if defined(SO_RCVTIMEO)
bio_dgram_data *data = (bio_dgram_data *)b->ptr;
- int sz = sizeof(int);
+ union { size_t s; int i; } sz = {0};
/* Is a timer active? */
if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0)
@@ -196,8 +275,10 @@ static void dgram_adjust_rcv_timeout(BIO *b)
/* Read current socket timeout */
#ifdef OPENSSL_SYS_WINDOWS
int timeout;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); }
else
{
@@ -205,9 +286,12 @@ static void dgram_adjust_rcv_timeout(BIO *b)
data->socket_timeout.tv_usec = (timeout % 1000) * 1000;
}
#else
+ sz.i = sizeof(data->socket_timeout);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
&(data->socket_timeout), (void *)&sz) < 0)
{ perror("getsockopt"); }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ OPENSSL_assert(sz.s<=sizeof(data->socket_timeout));
#endif
/* Get current time */
@@ -376,11 +460,10 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
int *ip;
struct sockaddr *to = NULL;
bio_dgram_data *data = NULL;
-#if defined(IP_MTU_DISCOVER) || defined(IP_MTU)
- long sockopt_val = 0;
- unsigned int sockopt_len = 0;
-#endif
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
+ int sockopt_val = 0;
+ socklen_t sockopt_len; /* assume that a system supporting IP_MTU is
+ * modern enough to define socklen_t */
socklen_t addr_len;
union {
struct sockaddr sa;
@@ -462,7 +545,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
/* (Linux)kernel sets DF bit on outgoing IP packets */
case BIO_CTRL_DGRAM_MTU_DISCOVER:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -470,7 +553,6 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
break;
}
- sockopt_len = sizeof(sockopt_val);
switch (addr.sa.sa_family)
{
case AF_INET:
@@ -479,7 +561,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
&sockopt_val, sizeof(sockopt_val))) < 0)
perror("setsockopt");
break;
-#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER)
+#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DO)
case AF_INET6:
sockopt_val = IPV6_PMTUDISC_DO;
if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
@@ -496,7 +578,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
#endif
case BIO_CTRL_DGRAM_QUERY_MTU:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -547,6 +629,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
#endif
break;
+ case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
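+ /* Fallback when the path MTU is unknown: the minimum datagram
+ * payload for the address family, i.e. minimum IP packet size
+ * minus IP and UDP headers (IPv4: 576-20-8; IPv6: 1280-40-8;
+ * v4-mapped IPv6 peers count as IPv4). */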
+ switch (data->peer.sa.sa_family)
+ {
+ case AF_INET:
+ ret = 576 - 20 - 8;
+ break;
+#if OPENSSL_USE_IPV6
+ case AF_INET6:
+#ifdef IN6_IS_ADDR_V4MAPPED
+ if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr))
+ ret = 576 - 20 - 8;
+ else
+#endif
+ ret = 1280 - 40 - 8;
+ break;
+#endif
+ default:
+ ret = 576 - 20 - 8;
+ break;
+ }
+ break;
case BIO_CTRL_DGRAM_GET_MTU:
return data->mtu;
break;
@@ -637,12 +740,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_RECV_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -650,12 +756,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
#if defined(SO_SNDTIMEO)
@@ -675,12 +789,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_SEND_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -688,12 +805,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
case BIO_CTRL_DGRAM_GET_SEND_TIMER_EXP:
@@ -738,6 +863,910 @@ static int dgram_puts(BIO *bp, const char *str)
return(ret);
}
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+ {
+ return(&methods_dgramp_sctp);
+ }
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+ {
+ BIO *bio;
+ int ret, optval = 20000;
+ int auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunk auth;
+ struct sctp_authchunks *authchunks;
+ socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+#endif
+#endif
+
+ bio=BIO_new(BIO_s_datagram_sctp());
+ if (bio == NULL) return(NULL);
+ BIO_set_fd(bio,fd,close_flag);
+
+ /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+ auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+ auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+
+ /* Test if activation was successful. When using accept(),
+ * SCTP-AUTH has to be activated for the listening socket
+ * already, otherwise the connected socket won't use it. */
+ sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(sockopt_len);
+ memset(authchunks, 0, sockopt_len);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ for (p = (unsigned char*) authchunks->gauth_chunks;
+ p < (unsigned char*) authchunks + sockopt_len;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ OPENSSL_assert(auth_data);
+ OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_AUTHENTICATION_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(ret >= 0);
+#else
+ sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ event.sctp_authentication_event = 1;
+
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+ /* Disable partial delivery by setting the min size
+ * larger than the max record size of 2^14 + 2048 + 13
+ */
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+ OPENSSL_assert(ret >= 0);
+
+ return(bio);
+ }
+
+int BIO_dgram_is_sctp(BIO *bio)
+ {
+ return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+ }
+
+static int dgram_sctp_new(BIO *bi)
+ {
+ bio_dgram_sctp_data *data = NULL;
+
+ bi->init=0;
+ bi->num=0;
+ data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+ if (data == NULL)
+ return 0;
+ memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+ data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+ bi->ptr = data;
+
+ bi->flags=0;
+ return(1);
+ }
+
+static int dgram_sctp_free(BIO *a)
+ {
+ bio_dgram_sctp_data *data;
+
+ if (a == NULL) return(0);
+ if ( ! dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_sctp_data *)a->ptr;
+ if(data != NULL) OPENSSL_free(data);
+
+ return(1);
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+ {
+ int ret;
+ struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+ if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+ {
+ struct sctp_authkeyid authkeyid;
+
+ /* delete key */
+ authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ }
+ }
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+ {
+ int ret = 0, n = 0, i, optval;
+ socklen_t optlen;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ union sctp_notification *snp;
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cmsgbuf[512];
+
+ if (out != NULL)
+ {
+ clear_socket_error();
+
+ do
+ {
+ memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+ iov.iov_base = out;
+ iov.iov_len = outl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = 512;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (msg.msg_controllen > 0)
+ {
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ {
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+#ifdef SCTP_RCVINFO
+ if (cmsg->cmsg_type == SCTP_RCVINFO)
+ {
+ struct sctp_rcvinfo *rcvinfo;
+
+ rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+ data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+ data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+ data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+ data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+ data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+ data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+ }
+#endif
+#ifdef SCTP_SNDRCV
+ if (cmsg->cmsg_type == SCTP_SNDRCV)
+ {
+ struct sctp_sndrcvinfo *sndrcvinfo;
+
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+ data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+ data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+ data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+ data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+ data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+ data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+ }
+#endif
+ }
+ }
+
+ if (n <= 0)
+ {
+ if (n < 0)
+ ret = n;
+ break;
+ }
+
+ if (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ snp = (union sctp_notification*) out;
+ if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ /* If a message has been delayed until the socket
+ * is dry, it can be sent now.
+ */
+ if (data->saved_message.length > 0)
+ {
+ dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+ data->saved_message.length);
+ OPENSSL_free(data->saved_message.data);
+ data->saved_message.length = 0;
+ }
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(i >= 0);
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ OPENSSL_assert(i >= 0);
+
+ event.sctp_sender_dry_event = 0;
+
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(i >= 0);
+#endif
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) out);
+
+ memset(out, 0, outl);
+ }
+ else
+ ret += n;
+ }
+ while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+ if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+ {
+ /* Partial message read; this should never happen! */
+
+ /* The buffer was too small: the peer sent a message
+ * larger than allowed. */
+ if (ret == outl)
+ return -1;
+
+ /* Test if socket buffer can handle max record
+ * size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Test if SCTP doesn't partially deliver below
+ * max record size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Partially delivered notification??? Probably a bug.... */
+ OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Everything seems ok till now, so it's most likely
+ * a message dropped by PR-SCTP.
+ */
+ memset(out, 0, outl);
+ BIO_set_retry_read(b);
+ return -1;
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ /* Test if peer uses SCTP-AUTH before continuing */
+ if (!data->peer_auth_tested)
+ {
+ int ii, auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunks *authchunks;
+
+ optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(optlen);
+ memset(authchunks, 0, optlen);
+ ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+ OPENSSL_assert(ii >= 0);
+
+ for (p = (unsigned char*) authchunks->gauth_chunks;
+ p < (unsigned char*) authchunks + optlen;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ if (!auth_data || !auth_forward)
+ {
+ BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+ return -1;
+ }
+
+ data->peer_auth_tested = 1;
+ }
+ }
+ return(ret);
+ }
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+ {
+ int ret;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+ struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+ struct bio_dgram_sctp_sndinfo handshake_sinfo;
+ struct iovec iov[1];
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+ struct sctp_sndinfo *sndinfo;
+ struct sctp_prinfo *prinfo;
+#else
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+ clear_socket_error();
+
+ /* If we're sending anything other than application data
+ * (TLS record type 23), disable all user parameters and flags.
+ */
+ if (in[0] != 23) {
+ memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+ handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+ sinfo = &handshake_sinfo;
+ }
+
+ /* If we have to send a shutdown alert message and the
+ * socket is not dry yet, we have to save it and send it
+ * as soon as the socket gets dry.
+ */
+ if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+ {
+ data->saved_message.bio = b;
+ data->saved_message.length = inl;
+ data->saved_message.data = OPENSSL_malloc(inl);
+ memcpy(data->saved_message.data, in, inl);
+ return inl;
+ }
+
+ iov[0].iov_base = (char *)in;
+ iov[0].iov_len = inl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = (caddr_t)cmsgbuf;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+ sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+ memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+ sndinfo->snd_sid = sinfo->snd_sid;
+ sndinfo->snd_flags = sinfo->snd_flags;
+ sndinfo->snd_ppid = sinfo->snd_ppid;
+ sndinfo->snd_context = sinfo->snd_context;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+ cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_PRINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+ prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+ memset(prinfo, 0, sizeof(struct sctp_prinfo));
+ prinfo->pr_policy = pinfo->pr_policy;
+ prinfo->pr_value = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+ sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+ sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+ sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+ sndrcvinfo->sinfo_context = sinfo->snd_context;
+ sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+ ret = sendmsg(b->num, &msg, 0);
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return(ret);
+ }
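+
+/* The ancillary-data pattern above, condensed into a stand-alone sketch.
+ * It assumes the RFC 6458 SCTP_SNDINFO API and a connected one-to-one
+ * SCTP socket; "send_on_stream" is an illustrative name and the block
+ * is kept out of the build:
+ */
+#if 0
+static int send_on_stream(int fd, const void *buf, size_t len, uint16_t sid)
+	{
+	struct iovec iov = { (void *)buf, len };
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct sctp_sndinfo *si;
+	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+	si = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+	memset(si, 0, sizeof(*si));
+	si->snd_sid = sid;		/* deliver on the requested stream */
+
+	return sendmsg(fd, &msg, 0);
+	}
+#endif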
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+ {
+ long ret=1;
+ bio_dgram_sctp_data *data = NULL;
+ socklen_t sockopt_len = 0;
+ struct sctp_authkeyid authkeyid;
+ struct sctp_authkey *authkey;
+
+ data = (bio_dgram_sctp_data *)b->ptr;
+
+ switch (cmd)
+ {
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to the maximum (2^14) and ignore user input,
+		 * so that the transport protocol handles the
+		 * fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to the maximum (2^14) and ignore input,
+		 * so that the transport protocol handles the
+		 * fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ case BIO_CTRL_DGRAM_CONNECT:
+		/* Always returns -1. */
+ ret = -1;
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer.
+		 * Always returns 1.
+		 */
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ if (num > 0)
+ data->in_handshake = 1;
+ else
+ data->in_handshake = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+ break;
+ case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+ /* New shared key for SCTP AUTH.
+ * Returns 0 on success, -1 otherwise.
+ */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Add new key */
+ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+ authkey = OPENSSL_malloc(sockopt_len);
+ memset(authkey, 0x00, sockopt_len);
+ authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+ /* This field is missing in FreeBSD 8.2 and earlier,
+ * and FreeBSD 8.3 and higher work without it.
+ */
+ authkey->sca_keylength = 64;
+#endif
+ memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+ if (ret < 0) break;
+
+ /* Reset active key */
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ break;
+ case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Set active key */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ /* CCS has been sent, so remember that and fall through
+ * to check if we need to deactivate an old key
+ */
+ data->ccs_sent = 1;
+
+ case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Has this command really been called or is this just a fall-through? */
+ if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+ data->ccs_rcvd = 1;
+
+		/* CCS has been both received and sent, so deactivate an old key */
+ if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+ {
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+			/* Deactivate the key, or delete the second-to-last
+			 * key if SCTP_AUTHENTICATION_EVENT is not available.
+			 */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+ &authkeyid, sockopt_len);
+ if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+ if (authkeyid.scact_keynumber > 0)
+ {
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+ }
+#endif
+
+ data->ccs_rcvd = 0;
+ data->ccs_sent = 0;
+ }
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(ptr, &(data->sndinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(&(data->sndinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(ptr, &data->rcvinfo, num);
+
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(&(data->rcvinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(ptr, &(data->prinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(&(data->prinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Always returns 1. */
+ if (num > 0)
+ data->save_shutdown = 1;
+ else
+ data->save_shutdown = 0;
+ break;
+
+ default:
+		/* Pass to the default ctrl function to
+		 * process commands that are not SCTP-specific.
+		 */
+ ret=dgram_ctrl(b, cmd, num, ptr);
+ break;
+ }
+ return(ret);
+ }
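+
+/* The AUTH key life cycle driven by BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY and
+ * BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY above, condensed into one sketch.
+ * "rotate_auth_key" is an illustrative name, "fd" a connected SCTP
+ * socket, "key" 64 bytes of fresh key material; kept out of the build:
+ */
+#if 0
+static int rotate_auth_key(int fd, const unsigned char key[64])
+	{
+	struct sctp_authkeyid kid;
+	socklen_t len = sizeof(struct sctp_authkeyid);
+	struct sctp_authkey *ak;
+	socklen_t aklen = sizeof(struct sctp_authkey) + 64;
+
+	/* install key number n+1 next to the active key n */
+	if (getsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &kid, &len) < 0)
+		return -1;
+	ak = OPENSSL_malloc(aklen);
+	if (ak == NULL)
+		return -1;
+	memset(ak, 0, aklen);
+	ak->sca_keynumber = kid.scact_keynumber + 1;
+	ak->sca_keylength = 64;	/* field absent on FreeBSD <= 8.2, see above */
+	memcpy(&ak->sca_key[0], key, 64);
+	if (setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_KEY, ak, aklen) < 0)
+		{
+		OPENSSL_free(ak);
+		return -1;
+		}
+	OPENSSL_free(ak);
+
+	/* later, once the peer also holds key n+1: switch sending to it */
+	kid.scact_keynumber = kid.scact_keynumber + 1;
+	return setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+			  &kid, sizeof(struct sctp_authkeyid));
+	}
+#endif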
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context)
+ {
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ if (handle_notifications != NULL)
+ {
+ data->handle_notifications = handle_notifications;
+ data->notification_context = context;
+ }
+ else
+ return -1;
+
+ return 0;
+ }
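+
+/* Registration sketch for the callback above; "app_handler", "app_state"
+ * and "handle_shutdown" are illustrative names:
+ *
+ *	static void app_handler(BIO *bio, void *context, void *buf)
+ *		{
+ *		union sctp_notification *snp = buf;
+ *		if (snp->sn_header.sn_type == SCTP_SHUTDOWN_EVENT)
+ *			handle_shutdown(context);
+ *		}
+ *
+ *	BIO_dgram_sctp_notification_cb(bio, app_handler, app_state);
+ */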
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+ int is_dry = 0;
+ int n, sockflags, ret;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* set sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 1;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+
+ /* peek for notification */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* if we find a notification, process it and try again if necessary */
+ while (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, 0);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+
+ if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+ is_dry = 1;
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+ /* found notification, peek again */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+		/* if we have already seen the dry event, don't block */
+ if (is_dry)
+ {
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ }
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+
+ if (is_dry)
+ {
+ fcntl(b->num, F_SETFL, sockflags);
+ }
+
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+ }
+
+ /* read anything else */
+ return is_dry;
+}
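+
+/* The event-subscription step above in isolation, for the SCTP_EVENT
+ * flavour of the API; "enable_sender_dry" is an illustrative name and
+ * the block is kept out of the build:
+ */
+#if 0
+#ifdef SCTP_EVENT
+static int enable_sender_dry(int fd, int on)
+	{
+	struct sctp_event ev;
+
+	memset(&ev, 0, sizeof(struct sctp_event));
+	ev.se_assoc_id = 0;		/* this socket's association */
+	ev.se_type = SCTP_SENDER_DRY_EVENT;
+	ev.se_on = on;
+	return setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT,
+			  &ev, sizeof(struct sctp_event));
+	}
+#endif
+#endif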
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+ {
+ int n, sockflags;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* Check if there are any messages waiting to be read */
+ do
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ fcntl(b->num, F_SETFL, sockflags);
+
+ /* if notification, process and try again */
+ if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+ {
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+ }
+
+ } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Return 1 if there is a message to be read, return 0 otherwise. */
+ if (n > 0)
+ return 1;
+ else
+ return 0;
+ }
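+
+/* Polling sketch: drain pending application data without blocking;
+ * "bio" is assumed to come from BIO_new_dgram_sctp():
+ *
+ *	while (BIO_dgram_sctp_msg_waiting(bio))
+ *		{
+ *		char buf[18445];
+ *		if (BIO_read(bio, buf, sizeof(buf)) <= 0)
+ *			break;
+ *		}
+ */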
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+ {
+ int n,ret;
+
+ n=strlen(str);
+ ret=dgram_sctp_write(bp,str,n);
+ return(ret);
+ }
+#endif
+
static int BIO_dgram_should_retry(int i)
{
int err;
diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.S b/main/openssl/crypto/bn/asm/armv4-gf2m.S
new file mode 100644
index 00000000..038f0864
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/armv4-gf2m.S
@@ -0,0 +1,213 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	d2,d16,#8		@ q1-q3 are shifted copies of a
+ vmull.p8 q0,d16,d17 @ a·bb
+ vshl.u64 d4,d16,#16
+ vmull.p8 q1,d2,d17 @ a<<8·bb
+ vshl.u64 d6,d16,#24
+ vmull.p8 q2,d4,d17 @ a<<16·bb
+ vshr.u64 d2,#8
+ vmull.p8 q3,d6,d17 @ a<<24·bb
+ vshl.u64 d3,#24
+ veor d0,d2
+ vshr.u64 d4,#16
+ veor d0,d3
+ vshl.u64 d5,#16
+ veor d0,d4
+ vshr.u64 d6,#24
+ veor d0,d5
+ vshl.u64 d7,#8
+ veor d0,d6
+ veor d0,d7
+	.word	0xe12fff1e		@ bx lr, encoded for ARMv4 assemblers
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov r4,#0
+ bic r5,r1,#3<<30 @ a1=a&0x3fffffff
+ str r4,[sp,#0] @ tab[0]=0
+ add r6,r5,r5 @ a2=a1<<1
+ str r5,[sp,#4] @ tab[1]=a1
+ eor r7,r5,r6 @ a1^a2
+ str r6,[sp,#8] @ tab[2]=a2
+ mov r8,r5,lsl#2 @ a4=a1<<2
+ str r7,[sp,#12] @ tab[3]=a1^a2
+ eor r9,r5,r8 @ a1^a4
+ str r8,[sp,#16] @ tab[4]=a4
+ eor r4,r6,r8 @ a2^a4
+ str r9,[sp,#20] @ tab[5]=a1^a4
+ eor r7,r7,r8 @ a1^a2^a4
+ str r4,[sp,#24] @ tab[6]=a2^a4
+ and r8,r12,r0,lsl#2
+ str r7,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and r9,r12,r0,lsr#1
+ ldr r5,[sp,r8] @ tab[b & 0x7]
+ and r8,r12,r0,lsr#4
+ ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
+ and r9,r12,r0,lsr#7
+ ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
+ eor r5,r5,r7,lsl#3 @ stall
+ mov r4,r7,lsr#29
+ ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
+
+ and r8,r12,r0,lsr#10
+ eor r5,r5,r6,lsl#6
+ eor r4,r4,r6,lsr#26
+ ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
+
+ and r9,r12,r0,lsr#13
+ eor r5,r5,r7,lsl#9
+ eor r4,r4,r7,lsr#23
+ ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
+
+ and r8,r12,r0,lsr#16
+ eor r5,r5,r6,lsl#12
+ eor r4,r4,r6,lsr#20
+ ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
+
+ and r9,r12,r0,lsr#19
+ eor r5,r5,r7,lsl#15
+ eor r4,r4,r7,lsr#17
+ ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
+
+ and r8,r12,r0,lsr#22
+ eor r5,r5,r6,lsl#18
+ eor r4,r4,r6,lsr#14
+ ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
+
+ and r9,r12,r0,lsr#25
+ eor r5,r5,r7,lsl#21
+ eor r4,r4,r7,lsr#11
+ ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
+
+ tst r1,#1<<30
+ and r8,r12,r0,lsr#28
+ eor r5,r5,r6,lsl#24
+ eor r4,r4,r6,lsr#8
+ ldr r6,[sp,r8] @ tab[b >> 30 ]
+
+ eorne r5,r5,r0,lsl#30
+ eorne r4,r4,r0,lsr#2
+ tst r1,#1<<31
+ eor r5,r5,r7,lsl#27
+ eor r4,r4,r7,lsr#5
+ eorne r5,r5,r0,lsl#31
+ eorne r4,r4,r0,lsr#1
+ eor r5,r5,r6,lsl#30
+ eor r4,r4,r6,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor d18,d18
+ vmov.32 d19,r3,r3 @ two copies of b1
+ vmov.32 d18[0],r1 @ a1
+
+ veor d20,d20
+ vld1.32 d21[],[sp,:32] @ two copies of b0
+ vmov.32 d20[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,d18
+ vmov d17,d19
+ bl mul_1x1_neon @ a1·b1
+ vmov d22,d0
+
+ vmov d16,d20
+ vmov d17,d21
+ bl mul_1x1_neon @ a0·b0
+ vmov d23,d0
+
+ veor d16,d20,d18
+ veor d17,d21,d19
+ veor d20,d23,d22
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor d23,d1
+ veor d22,d0
+ vst1.32 {d23[0]},[r0,:32]!
+ vst1.32 {d23[1]},[r0,:32]!
+ vst1.32 {d22[0]},[r0,:32]!
+ vst1.32 {d22[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+ stmdb sp!,{r4-r10,lr}
+ mov r10,r0 @ reassign 1st argument
+ mov r0,r3 @ r0=b1
+ ldr r3,[sp,#32] @ load b0
+ mov r12,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str r5,[r10,#8]
+ str r4,[r10,#12]
+
+ eor r0,r0,r3 @ flip b0 and b1
+ eor r1,r1,r2 @ flip a0 and a1
+ eor r3,r3,r0
+ eor r2,r2,r1
+ eor r0,r0,r3
+ eor r1,r1,r2
+ bl mul_1x1_ialu @ a0·b0
+ str r5,[r10]
+ str r4,[r10,#4]
+
+ eor r1,r1,r2
+ eor r0,r0,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ ldmia r10,{r6-r9}
+ eor r5,r5,r4
+ eor r4,r4,r7
+ eor r5,r5,r6
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r4,r4,r9
+ str r4,[r10,#8]
+ eor r5,r5,r4
+ add sp,sp,#32 @ destroy tab[8]
+ str r5,[r10,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
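+@ PC-relative load: at .Lpic the ARM PC reads 8 bytes ahead, so the +8
+@ bias makes "ldr r12,[pc,r12]" fetch the capability word from
+@ (.Lpic+8) + (OPENSSL_armcap_P-(.Lpic+8)) = OPENSSL_armcap_P.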
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.pl b/main/openssl/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 00000000..22ad1f85
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. For the time being it's a fairly mechanical,
+# low-hanging-fruit port from C, except that it has two code paths:
+# pure integer code suitable for any ARMv4 and later CPU, and NEON
+# code suitable for ARMv7. The pure integer 1x1 multiplication
+# subroutine runs in ~45 cycles on a dual-issue core such as the
+# Cortex-A8, which is ~50% faster than compiler-generated code. For
+# ECDH and ECDSA verify (but not for ECDSA sign) this means a 25%-45%
+# improvement depending on key length, more for longer keys. Even
+# though the NEON 1x1 multiplication runs in even fewer cycles, ~30,
+# the improvement is measurable only on longer keys. One has to
+# optimize code elsewhere to get the full NEON benefit...
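+#
+# In C terms the integer path is a 3-bit windowed carry-less multiply
+# (a rough sketch of the idea, not part of the module): with
+# a1 = a & 0x3fffffff it fills
+#	tab[8] = { 0, a1, a1<<1, a1^(a1<<1), a1<<2, a1^(a1<<2),
+#		   (a1<<1)^(a1<<2), a1^(a1<<1)^(a1<<2) };
+# and then accumulates
+#	for (k = 0; k < 32; k += 3)
+#		acc ^= (uint64_t)tab[(b >> k) & 7] << k;
+# The two bits masked out of a are folded in afterwards with the
+# conditional eor instructions, which is why the table entries never
+# need more than 32 bits.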
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of $a
+ vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
+ vshl.u64 `&Dlo("q2")`,d16,#16
+ vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
+ vshl.u64 `&Dlo("q3")`,d16,#24
+ vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
+ vshr.u64 `&Dlo("q1")`,#8
+ vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
+ vshl.u64 `&Dhi("q1")`,#24
+ veor d0,`&Dlo("q1")`
+ vshr.u64 `&Dlo("q2")`,#16
+ veor d0,`&Dhi("q1")`
+ vshl.u64 `&Dhi("q2")`,#16
+ veor d0,`&Dlo("q2")`
+ vshr.u64 `&Dlo("q3")`,#24
+ veor d0,`&Dhi("q2")`
+ vshl.u64 `&Dhi("q3")`,#8
+ veor d0,`&Dlo("q3")`
+ veor d0,`&Dhi("q3")`
+ bx lr
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov $a0,#0
+ bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
+ str $a0,[sp,#0] @ tab[0]=0
+ add $a2,$a1,$a1 @ a2=a1<<1
+ str $a1,[sp,#4] @ tab[1]=a1
+ eor $a12,$a1,$a2 @ a1^a2
+ str $a2,[sp,#8] @ tab[2]=a2
+ mov $a4,$a1,lsl#2 @ a4=a1<<2
+ str $a12,[sp,#12] @ tab[3]=a1^a2
+ eor $a14,$a1,$a4 @ a1^a4
+ str $a4,[sp,#16] @ tab[4]=a4
+ eor $a0,$a2,$a4 @ a2^a4
+ str $a14,[sp,#20] @ tab[5]=a1^a4
+ eor $a12,$a12,$a4 @ a1^a2^a4
+ str $a0,[sp,#24] @ tab[6]=a2^a4
+ and $i0,$mask,$b,lsl#2
+ str $a12,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and $i1,$mask,$b,lsr#1
+ ldr $lo,[sp,$i0] @ tab[b & 0x7]
+ and $i0,$mask,$b,lsr#4
+ ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
+ and $i1,$mask,$b,lsr#7
+ ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
+ eor $lo,$lo,$t1,lsl#3 @ stall
+ mov $hi,$t1,lsr#29
+ ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
+
+ and $i0,$mask,$b,lsr#10
+ eor $lo,$lo,$t0,lsl#6
+ eor $hi,$hi,$t0,lsr#26
+ ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
+
+ and $i1,$mask,$b,lsr#13
+ eor $lo,$lo,$t1,lsl#9
+ eor $hi,$hi,$t1,lsr#23
+ ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
+
+ and $i0,$mask,$b,lsr#16
+ eor $lo,$lo,$t0,lsl#12
+ eor $hi,$hi,$t0,lsr#20
+ ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
+
+ and $i1,$mask,$b,lsr#19
+ eor $lo,$lo,$t1,lsl#15
+ eor $hi,$hi,$t1,lsr#17
+ ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
+
+ and $i0,$mask,$b,lsr#22
+ eor $lo,$lo,$t0,lsl#18
+ eor $hi,$hi,$t0,lsr#14
+ ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
+
+ and $i1,$mask,$b,lsr#25
+ eor $lo,$lo,$t1,lsl#21
+ eor $hi,$hi,$t1,lsr#11
+ ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
+
+ tst $a,#1<<30
+ and $i0,$mask,$b,lsr#28
+ eor $lo,$lo,$t0,lsl#24
+ eor $hi,$hi,$t0,lsr#8
+ ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+
+ eorne $lo,$lo,$b,lsl#30
+ eorne $hi,$hi,$b,lsr#2
+ tst $a,#1<<31
+ eor $lo,$lo,$t1,lsl#27
+ eor $hi,$hi,$t1,lsr#5
+ eorne $lo,$lo,$b,lsl#31
+ eorne $hi,$hi,$b,lsr#1
+ eor $lo,$lo,$t0,lsl#30
+ eor $hi,$hi,$t0,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor $A1,$A1
+ vmov.32 $B1,r3,r3 @ two copies of b1
+ vmov.32 ${A1}[0],r1 @ a1
+
+ veor $A0,$A0
+ vld1.32 ${B0}[],[sp,:32] @ two copies of b0
+ vmov.32 ${A0}[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,$A1
+ vmov d17,$B1
+ bl mul_1x1_neon @ a1·b1
+ vmov $A1B1,d0
+
+ vmov d16,$A0
+ vmov d17,$B0
+ bl mul_1x1_neon @ a0·b0
+ vmov $A0B0,d0
+
+ veor d16,$A0,$A1
+ veor d17,$B0,$B1
+ veor $A0,$A0B0,$A1B1
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor $A0B0,d1
+ veor $A1B1,d0
+ vst1.32 {${A0B0}[0]},[r0,:32]!
+ vst1.32 {${A0B0}[1]},[r0,:32]!
+ vst1.32 {${A1B1}[0]},[r0,:32]!
+ vst1.32 {${A1B1}[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+___
+$ret="r10"; # reassigned 1st argument
+$code.=<<___;
+ stmdb sp!,{r4-r10,lr}
+ mov $ret,r0 @ reassign 1st argument
+ mov $b,r3 @ $b=b1
+ ldr r3,[sp,#32] @ load b0
+ mov $mask,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str $lo,[$ret,#8]
+ str $hi,[$ret,#12]
+
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
+ bl mul_1x1_ialu @ a0·b0
+ str $lo,[$ret]
+ str $hi,[$ret,#4]
+
+ eor $a,$a,r2
+ eor $b,$b,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+ ldmia $ret,{@r[0]-@r[3]}
+ eor $lo,$lo,$hi
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
+ str $hi,[$ret,#8]
+ eor $lo,$lo,$hi
+ add sp,sp,#32 @ destroy tab[8]
+ str $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/main/openssl/crypto/bn/asm/armv4-mont.pl b/main/openssl/crypto/bn/asm/armv4-mont.pl
index 14e0d2d1..f78a8b5f 100644
--- a/main/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/main/openssl/crypto/bn/asm/armv4-mont.pl
@@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@ bn_mul_mont:
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@@ -101,21 +104,21 @@ bn_mul_mont:
bne .L1st
adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
- sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
- ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@@ -129,13 +132,13 @@ bn_mul_mont:
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@@ -144,13 +147,13 @@ bn_mul_mont:
adds $nlo,$nlo,$ahi
mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
diff --git a/main/openssl/crypto/bn/asm/armv4-mont.s b/main/openssl/crypto/bn/asm/armv4-mont.s
index 0488455f..64c220b5 100644
--- a/main/openssl/crypto/bn/asm/armv4-mont.s
+++ b/main/openssl/crypto/bn/asm/armv4-mont.s
@@ -38,9 +38,9 @@ bn_mul_mont:
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
@@ -50,21 +50,21 @@ bn_mul_mont:
bne .L1st
adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
- sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
- ldr r6,[r3,#-4] @ np[0]
ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
@@ -78,13 +78,13 @@ bn_mul_mont:
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
- ldr r7,[r4,#8] @ tp[j+1]
adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
@@ -93,13 +93,13 @@ bn_mul_mont:
adds r12,r12,r11
mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
- adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
diff --git a/main/openssl/crypto/bn/asm/bn-586.S b/main/openssl/crypto/bn/asm/bn-586.S
new file mode 100644
index 00000000..fe873ce9
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/bn-586.S
@@ -0,0 +1,1384 @@
+.file "crypto/bn/asm/bn-586.s"
+.text
+.globl bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L000maw_finish
+.align 16
+.L001maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L001maw_loop
+.L000maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L002maw_finish2
+ jmp .L003maw_end
+.L002maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L003maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L004mw_finish
+.L005mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L004mw_finish
+ jmp .L005mw_loop
+.L004mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L006mw_finish2
+ jmp .L007mw_end
+.L006mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L007mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L008sw_finish
+.L009sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L009sw_loop
+.L008sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L010sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L010sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L010sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L010sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L010sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L010sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L010sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L010sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L011aw_finish
+.L012aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L012aw_loop
+.L011aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L013aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L013aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L013aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L013aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L013aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L013aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L013aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L013aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L014aw_finish
+.L015aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L016aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L016aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L016aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L016aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L016aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L016aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L016aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+.globl bn_sub_part_words
+.type bn_sub_part_words,@function
+.align 16
+bn_sub_part_words:
+.L_bn_sub_part_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L017aw_finish
+.L018aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L018aw_loop
+.L017aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+.L019aw_end:
+ cmpl $0,36(%esp)
+ je .L020pw_end
+ movl 36(%esp),%ebp
+ cmpl $0,%ebp
+ je .L020pw_end
+ jge .L021pw_pos
+
+ movl $0,%edx
+ subl %ebp,%edx
+ movl %edx,%ebp
+ andl $4294967288,%ebp
+ jz .L022pw_neg_finish
+.L023pw_neg_loop:
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl $0,%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L023pw_neg_loop
+.L022pw_neg_finish:
+ movl 36(%esp),%edx
+ movl $0,%ebp
+ subl %edx,%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ jmp .L020pw_end
+.L021pw_pos:
+ andl $4294967288,%ebp
+ jz .L024pw_pos_finish
+.L025pw_pos_loop:
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L026pw_nc0
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L027pw_nc1
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L028pw_nc2
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L029pw_nc3
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L030pw_nc4
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L031pw_nc5
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L032pw_nc6
+
+ movl 28(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,28(%ebx)
+ jnc .L033pw_nc7
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L025pw_pos_loop
+.L024pw_pos_finish:
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L034pw_tail_nc0
+ decl %ebp
+ jz .L020pw_end
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L035pw_tail_nc1
+ decl %ebp
+ jz .L020pw_end
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L036pw_tail_nc2
+ decl %ebp
+ jz .L020pw_end
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L037pw_tail_nc3
+ decl %ebp
+ jz .L020pw_end
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L038pw_tail_nc4
+ decl %ebp
+ jz .L020pw_end
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L039pw_tail_nc5
+ decl %ebp
+ jz .L020pw_end
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L040pw_tail_nc6
+ movl $1,%eax
+ jmp .L020pw_end
+.L041pw_nc_loop:
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L026pw_nc0:
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L027pw_nc1:
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L028pw_nc2:
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L029pw_nc3:
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L030pw_nc4:
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L031pw_nc5:
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L032pw_nc6:
+ movl 28(%esi),%ecx
+ movl %ecx,28(%ebx)
+.L033pw_nc7:
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L041pw_nc_loop
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L042pw_nc_end
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L034pw_tail_nc0:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L035pw_tail_nc1:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L036pw_tail_nc2:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L037pw_tail_nc3:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L038pw_tail_nc4:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L039pw_tail_nc5:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L040pw_tail_nc6:
+.L042pw_nc_end:
+ movl $0,%eax
+.L020pw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
diff --git a/main/openssl/crypto/bn/asm/bn-mips.S b/main/openssl/crypto/bn/asm/bn-mips.S
new file mode 100644
index 00000000..2e7cccb7
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/bn-mips.S
@@ -0,0 +1,2175 @@
+.set mips2
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $6,bn_mul_add_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ lw $12,0($5)
+ multu $12,$7
+ lw $13,0($4)
+ lw $14,4($5)
+ lw $15,4($4)
+ lw $8,2*4($5)
+ lw $9,2*4($4)
+ addu $13,$2
+ sltu $2,$13,$2 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
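+				# (In C terms: after s = s + c the
+				# carry out is simply (s < c), which
+				# is exactly what sltu computes here.)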
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ multu $14,$7
+ sltu $1,$13,$1
+ sw $13,0($4)
+ addu $2,$1
+
+ lw $10,3*4($5)
+ lw $11,3*4($4)
+ addu $15,$2
+ sltu $2,$15,$2
+ mflo $1
+ mfhi $14
+ addu $15,$1
+ addu $2,$14
+ multu $8,$7
+ sltu $1,$15,$1
+ sw $15,4($4)
+ addu $2,$1
+
+ subu $6,4
+ addu $4,4*4
+ addu $5,4*4
+ addu $9,$2
+ sltu $2,$9,$2
+ mflo $1
+ mfhi $8
+ addu $9,$1
+ addu $2,$8
+ multu $10,$7
+ sltu $1,$9,$1
+ sw $9,-2*4($4)
+ addu $2,$1
+
+
+ and $8,$6,$3
+ addu $11,$2
+ sltu $2,$11,$2
+ mflo $1
+ mfhi $10
+ addu $11,$1
+ addu $2,$10
+ sltu $1,$11,$1
+ sw $11,-4($4)
+ .set noreorder
+ bgtz $8,.L_bn_mul_add_words_loop
+ addu $2,$1
+
+ beqz $6,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$7
+ lw $13,0($4)
+ subu $6,1
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,0($4)
+ addu $2,$1
+ beqz $6,.L_bn_mul_add_words_return
+
+ lw $12,4($5)
+ multu $12,$7
+ lw $13,4($4)
+ subu $6,1
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,4($4)
+ addu $2,$1
+ beqz $6,.L_bn_mul_add_words_return
+
+ lw $12,2*4($5)
+ multu $12,$7
+ lw $13,2*4($4)
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,2*4($4)
+ addu $2,$1
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $6,bn_mul_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ lw $12,0($5)
+ multu $12,$7
+ lw $14,4($5)
+ lw $8,2*4($5)
+ lw $10,3*4($5)
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ multu $14,$7
+ sw $2,0($4)
+ addu $2,$13,$12
+
+ subu $6,4
+ addu $4,4*4
+ addu $5,4*4
+ mflo $1
+ mfhi $14
+ addu $2,$1
+ sltu $15,$2,$1
+ multu $8,$7
+ sw $2,-3*4($4)
+ addu $2,$15,$14
+
+ mflo $1
+ mfhi $8
+ addu $2,$1
+ sltu $9,$2,$1
+ multu $10,$7
+ sw $2,-2*4($4)
+ addu $2,$9,$8
+
+ and $8,$6,$3
+ mflo $1
+ mfhi $10
+ addu $2,$1
+ sltu $11,$2,$1
+ sw $2,-4($4)
+ .set noreorder
+ bgtz $8,.L_bn_mul_words_loop
+ addu $2,$11,$10
+
+ beqz $6,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$7
+ subu $6,1
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,0($4)
+ addu $2,$13,$12
+ beqz $6,.L_bn_mul_words_return
+
+ lw $12,4($5)
+ multu $12,$7
+ subu $6,1
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,4($4)
+ addu $2,$13,$12
+ beqz $6,.L_bn_mul_words_return
+
+ lw $12,2*4($5)
+ multu $12,$7
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,2*4($4)
+ addu $2,$13,$12
+
+.L_bn_mul_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $6,bn_sqr_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ lw $12,0($5)
+ multu $12,$12
+ lw $14,4($5)
+ lw $8,2*4($5)
+ lw $10,3*4($5)
+ mflo $13
+ mfhi $12
+ sw $13,0($4)
+ sw $12,4($4)
+
+ multu $14,$14
+ subu $6,4
+ addu $4,8*4
+ addu $5,4*4
+ mflo $15
+ mfhi $14
+ sw $15,-6*4($4)
+ sw $14,-5*4($4)
+
+ multu $8,$8
+ mflo $9
+ mfhi $8
+ sw $9,-4*4($4)
+ sw $8,-3*4($4)
+
+
+ multu $10,$10
+ and $8,$6,$3
+ mflo $11
+ mfhi $10
+ sw $11,-2*4($4)
+
+ .set noreorder
+ bgtz $8,.L_bn_sqr_words_loop
+ sw $10,-4($4)
+
+ beqz $6,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$12
+ subu $6,1
+ mflo $13
+ mfhi $12
+ sw $13,0($4)
+ sw $12,4($4)
+ beqz $6,.L_bn_sqr_words_return
+
+ lw $12,4($5)
+ multu $12,$12
+ subu $6,1
+ mflo $13
+ mfhi $12
+ sw $13,2*4($4)
+ sw $12,3*4($4)
+ beqz $6,.L_bn_sqr_words_return
+
+ lw $12,2*4($5)
+ multu $12,$12
+ mflo $13
+ mfhi $12
+ sw $13,4*4($4)
+ sw $12,5*4($4)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+
+.end bn_sqr_words_internal
+
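+# bn_add_words(rp,ap,bp,num): rp[] = ap[] + bp[], returning the final
+# carry (0 or 1) in $2. MIPS has no add-with-carry instruction, so each
+# limb's carry-out is reconstructed with sltu: once for ap[i]+bp[i] and
+# once more for the incoming carry.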
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $7,bn_add_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+ .set reorder
+ li $3,-4
+ and $1,$7,$3
+ beqz $1,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,4
+ lw $13,4($5)
+ and $1,$7,$3
+ lw $14,2*4($5)
+ addu $6,4*4
+ lw $15,3*4($5)
+ addu $4,4*4
+ lw $9,-3*4($6)
+ addu $5,4*4
+ lw $10,-2*4($6)
+ lw $11,-4($6)
+ addu $8,$12
+ sltu $24,$8,$12
+ addu $12,$8,$2
+ sltu $2,$12,$8
+ sw $12,-4*4($4)
+ addu $2,$24
+
+ addu $9,$13
+ sltu $25,$9,$13
+ addu $13,$9,$2
+ sltu $2,$13,$9
+ sw $13,-3*4($4)
+ addu $2,$25
+
+ addu $10,$14
+ sltu $24,$10,$14
+ addu $14,$10,$2
+ sltu $2,$14,$10
+ sw $14,-2*4($4)
+ addu $2,$24
+
+ addu $11,$15
+ sltu $25,$11,$15
+ addu $15,$11,$2
+ sltu $2,$15,$11
+ sw $15,-4($4)
+
+ .set noreorder
+ bgtz $1,.L_bn_add_words_loop
+ addu $2,$25
+
+ beqz $7,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ addu $8,$12
+ subu $7,1
+ sltu $24,$8,$12
+ addu $12,$8,$2
+ sltu $2,$12,$8
+ sw $12,0($4)
+ addu $2,$24
+ beqz $7,.L_bn_add_words_return
+
+ lw $13,4($5)
+ lw $9,4($6)
+ addu $9,$13
+ subu $7,1
+ sltu $25,$9,$13
+ addu $13,$9,$2
+ sltu $2,$13,$9
+ sw $13,4($4)
+ addu $2,$25
+ beqz $7,.L_bn_add_words_return
+
+ lw $14,2*4($5)
+ lw $10,2*4($6)
+ addu $10,$14
+ sltu $24,$10,$14
+ addu $14,$10,$2
+ sltu $2,$14,$10
+ sw $14,2*4($4)
+ addu $2,$24
+
+.L_bn_add_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+
+.end bn_add_words_internal
+
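+# bn_sub_words(rp,ap,bp,num): rp[] = ap[] - bp[], returning the final
+# borrow (0 or 1) in $2. The borrow out of each limb is recovered with
+# sltu (was ap[i] < bp[i]?) plus sgtu for the borrow-in correction.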
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $7,bn_sub_words_internal
+ move $2,$0
+ jr $31
+ move $4,$0
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+ .set reorder
+ li $3,-4
+ and $1,$7,$3
+ beqz $1,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,4
+ lw $13,4($5)
+ and $1,$7,$3
+ lw $14,2*4($5)
+ addu $6,4*4
+ lw $15,3*4($5)
+ addu $4,4*4
+ lw $9,-3*4($6)
+ addu $5,4*4
+ lw $10,-2*4($6)
+ lw $11,-4($6)
+ sltu $24,$12,$8
+ subu $8,$12,$8
+ subu $12,$8,$2
+ sgtu $2,$12,$8
+ sw $12,-4*4($4)
+ addu $2,$24
+
+ sltu $25,$13,$9
+ subu $9,$13,$9
+ subu $13,$9,$2
+ sgtu $2,$13,$9
+ sw $13,-3*4($4)
+ addu $2,$25
+
+
+ sltu $24,$14,$10
+ subu $10,$14,$10
+ subu $14,$10,$2
+ sgtu $2,$14,$10
+ sw $14,-2*4($4)
+ addu $2,$24
+
+ sltu $25,$15,$11
+ subu $11,$15,$11
+ subu $15,$11,$2
+ sgtu $2,$15,$11
+ sw $15,-4($4)
+
+ .set noreorder
+ bgtz $1,.L_bn_sub_words_loop
+ addu $2,$25
+
+ beqz $7,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,1
+ sltu $24,$12,$8
+ subu $8,$12,$8
+ subu $12,$8,$2
+ sgtu $2,$12,$8
+ sw $12,0($4)
+ addu $2,$24
+ beqz $7,.L_bn_sub_words_return
+
+ lw $13,4($5)
+ subu $7,1
+ lw $9,4($6)
+ sltu $25,$13,$9
+ subu $9,$13,$9
+ subu $13,$9,$2
+ sgtu $2,$13,$9
+ sw $13,4($4)
+ addu $2,$25
+ beqz $7,.L_bn_sub_words_return
+
+ lw $14,2*4($5)
+ lw $10,2*4($6)
+ sltu $24,$14,$10
+ subu $10,$14,$10
+ subu $14,$10,$2
+ sgtu $2,$14,$10
+ sw $14,2*4($4)
+ addu $2,$24
+
+.L_bn_sub_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_sub_words_internal
+
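+# bn_div_3_words is the BN_DIV3W helper used by BN_div: $4 points at the
+# most significant numerator word (the next two live at -4($4) and
+# -8($4)), $6 holds the most significant divisor word and $5 the one
+# below it. It returns a one-word quotient estimate, or all-ones when
+# the leading words are equal and the true quotient would overflow.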
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+ move $7,$4 # we know that bn_div_words does not
+ # touch $7, $10, $11 and preserves $6
+ # so that we can save two arguments
+ # and return address in registers
+ # instead of stack:-)
+
+ lw $4,($7)
+ move $10,$5
+ bne $4,$6,bn_div_3_words_internal
+ lw $5,-4($7)
+ li $2,-1
+ jr $31
+ move $4,$2
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+ .set reorder
+ move $11,$31
+ bal bn_div_words_internal
+ move $31,$11
+ multu $10,$2
+ lw $14,-2*4($7)
+ move $8,$0
+ mfhi $13
+ mflo $12
+ sltu $24,$13,$5
+.L_bn_div_3_words_inner_loop:
+ bnez $24,.L_bn_div_3_words_inner_loop_done
+ sgeu $1,$14,$12
+ seq $25,$13,$5
+ and $1,$25
+ sltu $15,$12,$10
+ addu $5,$6
+ subu $13,$15
+ subu $12,$10
+ sltu $24,$13,$5
+ sltu $8,$5,$6
+ or $24,$8
+ .set noreorder
+ beqz $1,.L_bn_div_3_words_inner_loop
+ subu $2,1
+ addu $2,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_div_3_words_internal
+
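+# bn_div_words(h,l,d): one-word quotient of the double-word value h:l
+# divided by d, assuming h < d. The internal routine first normalizes d
+# until its top bit is set ('break 6' if nonzero bits of h would be
+# shifted out), then builds the quotient 16 bits at a time with divu on
+# the top halves plus a small correction loop for each half.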
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $6,bn_div_words_internal
+ li $2,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $31
+ move $4,$2
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+ move $3,$0
+ bltz $6,.L_bn_div_words_body
+ move $25,$3
+ sll $6,1
+ bgtz $6,.-4
+ addu $25,1
+
+ .set reorder
+ negu $13,$25
+ li $14,-1
+ sll $14,$13
+ and $14,$4
+ srl $1,$5,$13
+ .set noreorder
+ beqz $14,.+12
+ nop
+ break 6 # signal overflow
+ .set reorder
+ sll $4,$25
+ sll $5,$25
+ or $4,$1
+.L_bn_div_words_body:
+ srl $3,$6,4*4 # bits
+ sgeu $1,$4,$6
+ .set noreorder
+ beqz $1,.+12
+ nop
+ subu $4,$6
+ .set reorder
+
+ li $8,-1
+ srl $9,$4,4*4 # bits
+ srl $8,4*4 # q=0xffffffff
+ beq $3,$9,.L_bn_div_words_skip_div1
+ divu $0,$4,$3
+ mflo $8
+.L_bn_div_words_skip_div1:
+ multu $6,$8
+ sll $15,$4,4*4 # bits
+ srl $1,$5,4*4 # bits
+ or $15,$1
+ mflo $12
+ mfhi $13
+.L_bn_div_words_inner_loop1:
+ sltu $14,$15,$12
+ seq $24,$9,$13
+ sltu $1,$9,$13
+ and $14,$24
+ sltu $2,$12,$6
+ or $1,$14
+ .set noreorder
+ beqz $1,.L_bn_div_words_inner_loop1_done
+ subu $13,$2
+ subu $12,$6
+ b .L_bn_div_words_inner_loop1
+ subu $8,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ sll $5,4*4 # bits
+ subu $4,$15,$12
+ sll $2,$8,4*4 # bits
+
+ li $8,-1
+ srl $9,$4,4*4 # bits
+ srl $8,4*4 # q=0xffffffff
+ beq $3,$9,.L_bn_div_words_skip_div2
+ divu $0,$4,$3
+ mflo $8
+.L_bn_div_words_skip_div2:
+ multu $6,$8
+ sll $15,$4,4*4 # bits
+ srl $1,$5,4*4 # bits
+ or $15,$1
+ mflo $12
+ mfhi $13
+.L_bn_div_words_inner_loop2:
+ sltu $14,$15,$12
+ seq $24,$9,$13
+ sltu $1,$9,$13
+ and $14,$24
+ sltu $3,$12,$6
+ or $1,$14
+ .set noreorder
+ beqz $1,.L_bn_div_words_inner_loop2_done
+ subu $13,$3
+ subu $12,$6
+ b .L_bn_div_words_inner_loop2
+ subu $8,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+
+ subu $4,$15,$12
+ or $2,$8
+ srl $3,$4,$25 # $3 contains remainder if anybody wants it
+ srl $6,$25 # restore $6
+
+ .set noreorder
+ move $5,$3
+ jr $31
+ move $4,$2
+.end bn_div_words_internal
+
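+# bn_mul_comba8(r,a,b): full 8x8-word Comba multiplication, r[0..15]=a*b.
+# Partial products are accumulated column by column into a rotating
+# three-word accumulator (the c1/c2/c3 of the mul_add_c comments). Note
+# how each multu is issued in the middle of folding the previous
+# product's mflo/mfhi into the column sum, hiding multiplier latency
+# behind the carry arithmetic.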
+.align 5
+.globl bn_mul_comba8
+.ent bn_mul_comba8
+bn_mul_comba8:
+ .set noreorder
+ .frame $29,6*4,$31
+ .mask 0x003f0000,-4
+ subu $29,6*4
+ sw $21,5*4($29)
+ sw $20,4*4($29)
+ sw $19,3*4($29)
+ sw $18,2*4($29)
+ sw $17,1*4($29)
+ sw $16,0*4($29)
+
+ .set reorder
+ lw $12,0($5) # If compiled with -mips3 option on
+ # R5000 box assembler barks on this
+ # line with "should not have mult/div
+ # as last instruction in bb (R10K
+ # bug)" warning. If anybody out there
+ # has a clue about how to circumvent
+ # this do send me a note.
+ # <appro@fy.chalmers.se>
+
+ lw $8,0($6)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $15,3*4($5)
+ lw $9,4($6)
+ lw $10,2*4($6)
+ lw $11,3*4($6)
+ mflo $2
+ mfhi $3
+
+ lw $16,4*4($5)
+ lw $18,5*4($5)
+ multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1);
+ lw $20,6*4($5)
+ lw $5,7*4($5)
+ lw $17,4*4($6)
+ lw $19,5*4($6)
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1);
+ addu $7,$25,$1
+ lw $21,6*4($6)
+ lw $6,7*4($6)
+ sw $2,0($4) # r[0]=c1;
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ sw $3,4($4) # r[1]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4) # r[2]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$8 # mul_add_c(a[4],b[0],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4) # r[3]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$17 # mul_add_c(a[0],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$19 # mul_add_c(a[0],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4) # r[4]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$17 # mul_add_c(a[1],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $16,$9 # mul_add_c(a[4],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$8 # mul_add_c(a[5],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$8 # mul_add_c(a[6],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4) # r[5]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$9 # mul_add_c(a[5],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$10 # mul_add_c(a[4],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$17 # mul_add_c(a[2],b[4],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$19 # mul_add_c(a[1],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$21 # mul_add_c(a[0],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$6 # mul_add_c(a[0],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,6*4($4) # r[6]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$21 # mul_add_c(a[1],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$19 # mul_add_c(a[2],b[5],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$17 # mul_add_c(a[3],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$11 # mul_add_c(a[4],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $18,$10 # mul_add_c(a[5],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $20,$9 # mul_add_c(a[6],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$8 # mul_add_c(a[7],b[0],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$9 # mul_add_c(a[7],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,7*4($4) # r[7]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$10 # mul_add_c(a[6],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$11 # mul_add_c(a[5],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $16,$17 # mul_add_c(a[4],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$19 # mul_add_c(a[3],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$21 # mul_add_c(a[2],b[6],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$6 # mul_add_c(a[1],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$6 # mul_add_c(a[2],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,8*4($4) # r[8]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$21 # mul_add_c(a[3],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$19 # mul_add_c(a[4],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$17 # mul_add_c(a[5],b[4],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$11 # mul_add_c(a[6],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $5,$10 # mul_add_c(a[7],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $5,$11 # mul_add_c(a[7],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,9*4($4) # r[9]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $20,$17 # mul_add_c(a[6],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $18,$19 # mul_add_c(a[5],b[5],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$21 # mul_add_c(a[4],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$6 # mul_add_c(a[3],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$6 # mul_add_c(a[4],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,10*4($4) # r[10]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$21 # mul_add_c(a[5],b[6],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$19 # mul_add_c(a[6],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $5,$17 # mul_add_c(a[7],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $5,$19 # mul_add_c(a[7],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,11*4($4) # r[11]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$21 # mul_add_c(a[6],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$6 # mul_add_c(a[5],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$6 # mul_add_c(a[6],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,12*4($4) # r[12]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$21 # mul_add_c(a[7],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$6 # mul_add_c(a[7],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,13*4($4) # r[13]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sw $7,14*4($4) # r[14]=c3;
+ sw $2,15*4($4) # r[15]=c1;
+
+ .set noreorder
+ lw $21,5*4($29)
+ lw $20,4*4($29)
+ lw $19,3*4($29)
+ lw $18,2*4($29)
+ lw $17,1*4($29)
+ lw $16,0*4($29)
+ jr $31
+ addu $29,6*4
+.end bn_mul_comba8
+
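+# bn_mul_comba4(r,a,b): the 4x4-word variant of the same Comba scheme,
+# producing r[0..7].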
+.align 5
+.globl bn_mul_comba4
+.ent bn_mul_comba4
+bn_mul_comba4:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $15,3*4($5)
+ lw $9,4($6)
+ lw $10,2*4($6)
+ lw $11,3*4($6)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1);
+ addu $7,$25,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sw $2,6*4($4)
+ sw $3,7*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_mul_comba4
+
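+# bn_sqr_comba8(r,a): Comba squaring, r[0..15] = a^2. Cross products
+# a[i]*a[j] (i != j) occur twice in a square, so the mul_add_c2 steps
+# double lo and hi with sll by 1, recovering each shifted-out top bit
+# with slt against $0 before adding it into the column.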
+.align 5
+.globl bn_sqr_comba8
+.ent bn_sqr_comba8
+bn_sqr_comba8:
+ .set reorder
+ lw $12,0($5)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ lw $15,3*4($5)
+
+ multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $8,4*4($5)
+ lw $9,5*4($5)
+ lw $10,6*4($5)
+ lw $11,7*4($5)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $7,$25,$1
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $13,$14 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$12 # mul_add_c2(a[4],b[0],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$9 # mul_add_c2(a[0],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$8 # mul_add_c2(a[1],b[4],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ multu $10,$12 # mul_add_c2(a[6],b[0],c1,c2,c3);
+ addu $3,$1
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $9,$13 # mul_add_c2(a[5],b[1],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$14 # mul_add_c2(a[4],b[2],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$11 # mul_add_c2(a[0],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,6*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $13,$10 # mul_add_c2(a[1],b[6],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $14,$9 # mul_add_c2(a[2],b[5],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $15,$8 # mul_add_c2(a[3],b[4],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $11,$13 # mul_add_c2(a[7],b[1],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,7*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $10,$14 # mul_add_c2(a[6],b[2],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $9,$15 # mul_add_c2(a[5],b[3],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $8,$8 # mul_add_c(a[4],b[4],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$11 # mul_add_c2(a[2],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,8*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $15,$10 # mul_add_c2(a[3],b[6],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$9 # mul_add_c2(a[4],b[5],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $11,$15 # mul_add_c2(a[7],b[3],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,9*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $10,$8 # mul_add_c2(a[6],b[4],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $9,$9 # mul_add_c(a[5],b[5],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $8,$11 # mul_add_c2(a[4],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,10*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $9,$10 # mul_add_c2(a[5],b[6],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $11,$9 # mul_add_c2(a[7],b[5],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,11*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $10,$10 # mul_add_c(a[6],b[6],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $10,$11 # mul_add_c2(a[6],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,12*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $11,$11 # mul_add_c(a[7],b[7],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,13*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sw $7,14*4($4)
+ sw $2,15*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_sqr_comba8
+
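+# bn_sqr_comba4(r,a): 4-word Comba squaring, r[0..7] = a^2, using the
+# same mul_add_c2 doubling trick as bn_sqr_comba8.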
+.align 5
+.globl bn_sqr_comba4
+.ent bn_sqr_comba4
+bn_sqr_comba4:
+ .set reorder
+ lw $12,0($5)
+ lw $13,4($5)
+ multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $14,2*4($5)
+ lw $15,3*4($5)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $7,$25,$1
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $13,$14 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sw $2,6*4($4)
+ sw $3,7*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_sqr_comba4
diff --git a/main/openssl/crypto/bn/asm/co-586.S b/main/openssl/crypto/bn/asm/co-586.S
new file mode 100644
index 00000000..3cb80735
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/co-586.S
@@ -0,0 +1,1254 @@
+.file "crypto/bn/asm/co-586.s"
+.text
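+/* Comba multiplication/squaring for x86: %esi and %edi walk a[] and b[]
+   (in the squaring entries %esi is a[] and %edi the result), while
+   %ebx/%ecx/%ebp serve as the rotating three-word column accumulator.
+   Each mull leaves its 32x32->64 product in %edx:%eax, folded in with
+   addl/adcl; in the mul entries 20(%esp) re-fetches the r pointer after
+   the four register pushes. */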
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
diff --git a/main/openssl/crypto/bn/asm/ia64-mont.pl b/main/openssl/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 00000000..e2586584
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in a >100% improvement of the 512-bit RSA sign benchmark and a 50%
+# improvement of the 1024-bit one [in comparison to the original
+# version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
+# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As can be seen, RSA sign performance improves by 130-30% (the gain
+# shrinks as keys get longer), while verify improves by 74-13%.
+# DSA performance improves by 115-30%.
+
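+# HP-UX ILP32 keeps pointers 32-bit, so pointer arithmetic must use
+# addp4; under the 64-bit ABIs (+DD64/-mlp64 on the command line) and
+# on every other OS a plain add is used instead.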
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.align 64
+.global bn_mul_mont#
+.proc bn_mul_mont#
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align 64
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
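+// Outer loop: each pass folds the next word of bp into the running
+// (num+1)-word tp[] vector, tp = (tp + ap[]*bp[i] + m0*np[]) / 2^64,
+// where m0 = (tp[0]+ap[0]*bp[i])*n0 is chosen so the low word cancels.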
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of
+// 7 in the latter case accounts for a two-tick pipeline stall, which
+// means that its performance would be ~20% lower than optimal. No
+// attempt was made to address this, because the original Itanium is
+// hardly represented in the wild anymore...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
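+// (note added for clarity: topbit is now either 0 or all ones, so the
+// and/andcm/or above select tp as the copy source when the subtraction
+// borrowed and rp otherwise; .Lcopy then writes the chosen vector to
+// rp while clearing tp)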
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,0x1ffff
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align 64
+.skip 48 // aligns loop body
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
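+// (note added for clarity: when num<8, the predicated fcvt.fxu xx=f0
+// instructions below substitute zeros for the missing words, so the
+// fixed 8-word pipeline can run unconditionally)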
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=1 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
+// to measure with the help of the Interval Time Counter indicated that
+// the factor is a tad higher: 33 or 34, if not 35. Exact measurement
+// and addressing the issue are problematic, because I don't have
+// access to a platform-specific instruction-level profiler. On Itanium
+// it should run in 56*n ticks, because of the higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+	(p48)	add	t[4]=t[4],a5		// (p17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// The above loop has to execute one more time, without (p16), which is
+// replaced with a merged move of np[8] to the GPR bank.
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0 // 0:
+ (p40) add a3=a3,n3 // (p17) a3+=n3
+ (p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1 // 4:
+ (p41) add a4=a4,n4 // (p17) a4+=n4
+ (p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p0) nop.f 0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3 // 8:
+ (p40) add a5=a5,n5 // (p17) a5+=n5
+ (p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0 // 9:
+	(p48)	add	t[4]=t[4],a5		// (p17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0 // 10:
+ (p40) cmp.ltu p43,p41=a5,n5
+ (p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8] // 12:
+ (p41) add a6=a6,n6 // (p17) a6+=n6
+ (p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4 // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0 // 14:
+ (p41) cmp.ltu p42,p40=a6,n6
+ (p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0 // 16:
+ (p40) add a7=a7,n7 // (p17) a7+=n7
+ (p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0 // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0 // 18:
+ (p40) cmp.ltu p43,p41=a7,n7
+ (p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6 // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0 // 20:
+ (p41) add a8=a8,n8 // (p17) a8+=n8
+ (p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0 // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+ (p41) cmp.ltu p42,p40=a8,n8
+ (p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7 // 22:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+ (p0) add r16=-7*16,prevsp
+ (p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
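+// (note added for clarity: the extra carry bit is needed because the
+// reduced value is only guaranteed to be below 2*np, i.e. up to one
+// bit wider than the modulus, hence the trial subtraction below)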
+{ .mmi; (p50)add t[0]=t[0],r0,1
+ add r18=-5*16,prevsp
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[7],n2
+ (p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+ (p34)cmp.geu p35,p33=n2,t[7];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[6]
+ (p35)cmp.geu p34,p32=n3,t[6] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[5],n4
+ (p34)sub n4=t[5],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[4],n5
+ (p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+ (p35)cmp.geu p34,p32=n5,t[4];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[3]
+ (p34)cmp.geu p35,p33=n6,t[3] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[2],n7
+ (p35)sub n7=t[2],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[1],n8
+ (p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+ (p34)cmp.geu p35,p33=n8,t[1];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[0]
+ (p35)cmp.geu p34,p32=a8,t[0] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
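+// (note added for clarity: p32/p34 still hold the final borrow of the
+// subtraction above, so the predicated stores below write
+// tmp[num]-np[num] when it did not underflow and tmp[num] otherwise)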
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[7],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[6],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[5],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[4],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[3],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[2],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[1],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,0x1ffff };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/ia64.S b/main/openssl/crypto/bn/asm/ia64.S
index 951abc53..c0cee821 100644
--- a/main/openssl/crypto/bn/asm/ia64.S
+++ b/main/openssl/crypto/bn/asm/ia64.S
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/main/openssl/crypto/bn/asm/mips-mont.S b/main/openssl/crypto/bn/asm/mips-mont.S
new file mode 100644
index 00000000..1b875a2a
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips-mont.S
@@ -0,0 +1,284 @@
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+ lw $8,16($29)
+ lw $9,20($29)
+ slt $1,$9,4
+ bnez $1,1f
+ li $2,0
+ slt $1,$9,17 # on in-order CPU
+ bnez $1,bn_mul_mont_internal
+ nop
+1: jr $31
+ li $4,0
+.end bn_mul_mont
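+# (reference note, added for clarity: the stub above returns 0, i.e.
+# "not handled", unless 4 <= num <= 16, so that very small and large
+# sizes fall back to the generic C implementation; the cutoff at 17
+# words is tuned for in-order cores, as the comment above says)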
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $30,14*4,$31
+ .mask 0x40000000|16711680,-4
+ sub $29,14*4
+ sw $30,(14-1)*4($29)
+ sw $23,(14-2)*4($29)
+ sw $22,(14-3)*4($29)
+ sw $21,(14-4)*4($29)
+ sw $20,(14-5)*4($29)
+ sw $19,(14-6)*4($29)
+ sw $18,(14-7)*4($29)
+ sw $17,(14-8)*4($29)
+ sw $16,(14-9)*4($29)
+ move $30,$29
+
+ .set reorder
+ lw $8,0($8)
+ lw $13,0($6) # bp[0]
+ lw $12,0($5) # ap[0]
+ lw $14,0($7) # np[0]
+
+ sub $29,2*4 # place for two extra words
+ sll $9,2
+ li $1,-4096
+ sub $29,$9
+ and $29,$1
+
+ multu $12,$13
+ lw $16,4($5)
+ lw $18,4($7)
+ mflo $10
+ mfhi $11
+ multu $10,$8
+ mflo $23
+
+ multu $16,$13
+ mflo $16
+ mfhi $17
+
+ multu $14,$23
+ mflo $24
+ mfhi $25
+ multu $18,$23
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+ mflo $18
+ mfhi $19
+
+ move $15,$29
+ li $22,2*4
+.align 4
+.L1st:
+ .set noreorder
+ add $12,$5,$22
+ add $14,$7,$22
+ lw $12,($12)
+ lw $14,($14)
+
+ multu $12,$13
+ addu $10,$16,$11
+ addu $24,$18,$25
+ sltu $1,$10,$11
+ sltu $2,$24,$25
+ addu $11,$17,$1
+ addu $25,$19,$2
+ mflo $16
+ mfhi $17
+
+ addu $24,$10
+ sltu $1,$24,$10
+ multu $14,$23
+ addu $25,$1
+ addu $22,4
+ sw $24,($15)
+ sltu $2,$22,$9
+ mflo $18
+ mfhi $19
+
+ bnez $2,.L1st
+ add $15,4
+ .set reorder
+
+ addu $10,$16,$11
+ sltu $1,$10,$11
+ addu $11,$17,$1
+
+ addu $24,$18,$25
+ sltu $2,$24,$25
+ addu $25,$19,$2
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+
+ sw $24,($15)
+
+ addu $25,$11
+ sltu $1,$25,$11
+ sw $25,4($15)
+ sw $1,2*4($15)
+
+ li $21,4
+.align 4
+.Louter:
+ add $13,$6,$21
+ lw $13,($13)
+ lw $12,($5)
+ lw $16,4($5)
+ lw $20,($29)
+
+ multu $12,$13
+ lw $14,($7)
+ lw $18,4($7)
+ mflo $10
+ mfhi $11
+ addu $10,$20
+ multu $10,$8
+ sltu $1,$10,$20
+ addu $11,$1
+ mflo $23
+
+ multu $16,$13
+ mflo $16
+ mfhi $17
+
+ multu $14,$23
+ mflo $24
+ mfhi $25
+
+ multu $18,$23
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+ mflo $18
+ mfhi $19
+
+ move $15,$29
+ li $22,2*4
+ lw $20,4($15)
+.align 4
+.Linner:
+ .set noreorder
+ add $12,$5,$22
+ add $14,$7,$22
+ lw $12,($12)
+ lw $14,($14)
+
+ multu $12,$13
+ addu $10,$16,$11
+ addu $24,$18,$25
+ sltu $1,$10,$11
+ sltu $2,$24,$25
+ addu $11,$17,$1
+ addu $25,$19,$2
+ mflo $16
+ mfhi $17
+
+ addu $10,$20
+ addu $22,4
+ multu $14,$23
+ sltu $1,$10,$20
+ addu $24,$10
+ addu $11,$1
+ sltu $2,$24,$10
+ lw $20,2*4($15)
+ addu $25,$2
+ sltu $1,$22,$9
+ mflo $18
+ mfhi $19
+ sw $24,($15)
+ bnez $1,.Linner
+ add $15,4
+ .set reorder
+
+ addu $10,$16,$11
+ sltu $1,$10,$11
+ addu $11,$17,$1
+ addu $10,$20
+ sltu $2,$10,$20
+ addu $11,$2
+
+ lw $20,2*4($15)
+ addu $24,$18,$25
+ sltu $1,$24,$25
+ addu $25,$19,$1
+ addu $24,$10
+ sltu $2,$24,$10
+ addu $25,$2
+ sw $24,($15)
+
+ addu $24,$25,$11
+ sltu $25,$24,$11
+ addu $24,$20
+ sltu $1,$24,$20
+ addu $25,$1
+ sw $24,4($15)
+ sw $25,2*4($15)
+
+ addu $21,4
+ sltu $2,$21,$9
+ bnez $2,.Louter
+
+ .set noreorder
+ add $20,$29,$9 # &tp[num]
+ move $15,$29
+ move $5,$29
+ li $11,0 # clear borrow bit
+
+.align 4
+.Lsub: lw $10,($15)
+ lw $24,($7)
+ add $15,4
+ add $7,4
+ subu $24,$10,$24 # tp[i]-np[i]
+ sgtu $1,$24,$10
+ subu $10,$24,$11
+ sgtu $11,$10,$24
+ sw $10,($4)
+ or $11,$1
+ sltu $1,$15,$20
+ bnez $1,.Lsub
+ add $4,4
+
+ subu $11,$25,$11 # handle upmost overflow bit
+ move $15,$29
+ sub $4,$9 # restore rp
+ not $25,$11
+
+ and $5,$11,$29
+ and $6,$25,$4
+ or $5,$5,$6 # ap=borrow?tp:rp
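+				# (note added for clarity: $11 is 0 or
+				# all ones here -- the reduced value is
+				# below 2*np, so tp[num] minus the
+				# borrow is 0 or -1 -- which makes the
+				# and/or pair a branch-free select of
+				# the copy source)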
+
+.align 4
+.Lcopy: lw $12,($5)
+ add $5,4
+ sw $0,($15)
+ add $15,4
+ sltu $1,$15,$20
+ sw $12,($4)
+ bnez $1,.Lcopy
+ add $4,4
+
+ li $4,1
+ li $2,1
+
+ .set noreorder
+ move $29,$30
+ lw $30,(14-1)*4($29)
+ lw $23,(14-2)*4($29)
+ lw $22,(14-3)*4($29)
+ lw $21,(14-4)*4($29)
+ lw $20,(14-5)*4($29)
+ lw $19,(14-6)*4($29)
+ lw $18,(14-7)*4($29)
+ lw $17,(14-8)*4($29)
+ lw $16,(14-9)*4($29)
+ jr $31
+ add $29,14*4
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro@openssl.org>"
diff --git a/main/openssl/crypto/bn/asm/mips-mont.pl b/main/openssl/crypto/bn/asm/mips-mont.pl
new file mode 100644
index 00000000..caae04ed
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips-mont.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module isn't of direct interest to OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from a
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers a >3x improvement on MIPS64
+# and a 30-75% improvement [less for longer keys] on MIPS32 over
+# compiler-generated code.
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $SZREG=4;
+}
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
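+# (illustrative invocation, added for clarity: the first argument is
+# the flavour, and the first filename-looking argument becomes the
+# output file, e.g. "perl mips-mont.pl o32 mips-mont.s")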
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $BNSZ=8;
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $BNSZ=4;
+}
+
+# int bn_mul_mont(
+$rp=$a0; # BN_ULONG *rp,
+$ap=$a1; # const BN_ULONG *ap,
+$bp=$a2; # const BN_ULONG *bp,
+$np=$a3; # const BN_ULONG *np,
+$n0=$a4; # const BN_ULONG *n0,
+$num=$a5; # int num);
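+
+# For reference, here is a minimal Perl model of the word-level CIOS
+# Montgomery multiplication the assembler below implements (an
+# illustrative sketch added for clarity, not part of the original
+# module). It returns ap*bp*R^-1 mod np with R=2^($w*$num), assuming
+# ap and bp are already reduced mod np, $n0 is the precomputed
+# -np->[0]^-1 mod 2^$w, and a 64-bit perl with $w<=31 so that word
+# products fit in a native integer.
+sub mont_mul_ref {
+	my ($ap,$bp,$np,$n0,$num,$w)=@_;	# little-endian word arrays
+	my $mask=(1<<$w)-1;
+	my @t=(0)x($num+2);			# tp[] plus two extra words
+	for (my $i=0; $i<$num; $i++) {
+		my $c=0;			# t += ap*bp[i]
+		for (my $j=0; $j<$num; $j++) {
+			my $v=$t[$j]+$ap->[$j]*$bp->[$i]+$c;
+			$t[$j]=$v&$mask; $c=$v>>$w;
+		}
+		my $v=$t[$num]+$c;
+		$t[$num]=$v&$mask; $t[$num+1]=$v>>$w;
+		my $m=($t[0]*$n0)&$mask;	# Montgomery factor
+		$c=($t[0]+$m*$np->[0])>>$w;	# low word cancels to 0
+		for (my $j=1; $j<$num; $j++) {	# t=(t+m*np)/2^$w
+			$v=$t[$j]+$m*$np->[$j]+$c;
+			$t[$j-1]=$v&$mask; $c=$v>>$w;
+		}
+		$v=$t[$num]+$c;
+		$t[$num-1]=$v&$mask;
+		$t[$num]=$t[$num+1]+($v>>$w); $t[$num+1]=0;
+	}
+	my ($b,@r)=(0);				# final t-=np if t>=np
+	for (my $j=0; $j<$num; $j++) {
+		my $v=$t[$j]-$np->[$j]-$b;
+		$b=$v<0?1:0; push(@r,$v&$mask);
+	}
+	return $t[$num]>=$b ? \@r : [@t[0..$num-1]];
+}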
+
+$lo0=$a6;
+$hi0=$a7;
+$lo1=$t1;
+$hi1=$t2;
+$aj=$s0;
+$bi=$s1;
+$nj=$s2;
+$tp=$s3;
+$alo=$s4;
+$ahi=$s5;
+$nlo=$s6;
+$nhi=$s7;
+$tj=$s8;
+$i=$s9;
+$j=$s10;
+$m1=$s11;
+
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+ lw $n0,16($sp)
+ lw $num,20($sp)
+___
+$code.=<<___;
+ slt $at,$num,4
+ bnez $at,1f
+ li $t0,0
+ slt $at,$num,17 # on in-order CPU
+ bnez $at,bn_mul_mont_internal
+ nop
+1: jr $ra
+ li $a0,0
+.end bn_mul_mont
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $fp,$FRAMESIZE*$SZREG,$ra
+ .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ move $fp,$sp
+
+ .set reorder
+ $LD $n0,0($n0)
+ $LD $bi,0($bp) # bp[0]
+ $LD $aj,0($ap) # ap[0]
+ $LD $nj,0($np) # np[0]
+
+ $PTR_SUB $sp,2*$BNSZ # place for two extra words
+ sll $num,`log($BNSZ)/log(2)`
+ li $at,-4096
+ $PTR_SUB $sp,$num
+ and $sp,$at
+
+ $MULTU $aj,$bi
+ $LD $alo,$BNSZ($ap)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $MULTU $lo0,$n0
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+.align 4
+.L1st:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $MULTU $nj,$m1
+ $ADDU $hi1,$at
+ addu $j,$BNSZ
+ $ST $lo1,($tp)
+ sltu $t0,$j,$num
+ mflo $nlo
+ mfhi $nhi
+
+ bnez $t0,.L1st
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+
+ $ADDU $lo1,$nlo,$hi1
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi1,$nhi,$t0
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+
+ $ST $lo1,($tp)
+
+ $ADDU $hi1,$hi0
+ sltu $at,$hi1,$hi0
+ $ST $hi1,$BNSZ($tp)
+ $ST $at,2*$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+.Louter:
+ $PTR_ADD $bi,$bp,$i
+ $LD $bi,($bi)
+ $LD $aj,($ap)
+ $LD $alo,$BNSZ($ap)
+ $LD $tj,($sp)
+
+ $MULTU $aj,$bi
+ $LD $nj,($np)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $ADDU $lo0,$tj
+ $MULTU $lo0,$n0
+ sltu $at,$lo0,$tj
+ $ADDU $hi0,$at
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+ $LD $tj,$BNSZ($tp)
+.align 4
+.Linner:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo0,$tj
+ addu $j,$BNSZ
+ $MULTU $nj,$m1
+ sltu $at,$lo0,$tj
+ $ADDU $lo1,$lo0
+ $ADDU $hi0,$at
+ sltu $t0,$lo1,$lo0
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $hi1,$t0
+ sltu $at,$j,$num
+ mflo $nlo
+ mfhi $nhi
+ $ST $lo1,($tp)
+ bnez $at,.Linner
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+ $ADDU $lo0,$tj
+ sltu $t0,$lo0,$tj
+ $ADDU $hi0,$t0
+
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo1,$hi1
+ $ADDU $hi1,$nhi,$at
+ $ADDU $lo1,$lo0
+ sltu $t0,$lo1,$lo0
+ $ADDU $hi1,$t0
+ $ST $lo1,($tp)
+
+ $ADDU $lo1,$hi1,$hi0
+ sltu $hi1,$lo1,$hi0
+ $ADDU $lo1,$tj
+ sltu $at,$lo1,$tj
+ $ADDU $hi1,$at
+ $ST $lo1,$BNSZ($tp)
+ $ST $hi1,2*$BNSZ($tp)
+
+ addu $i,$BNSZ
+ sltu $t0,$i,$num
+ bnez $t0,.Louter
+
+ .set noreorder
+ $PTR_ADD $tj,$sp,$num # &tp[num]
+ move $tp,$sp
+ move $ap,$sp
+ li $hi0,0 # clear borrow bit
+
+.align 4
+.Lsub: $LD $lo0,($tp)
+ $LD $lo1,($np)
+ $PTR_ADD $tp,$BNSZ
+ $PTR_ADD $np,$BNSZ
+ $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu $at,$lo1,$lo0
+ $SUBU $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ $ST $lo0,($rp)
+ or $hi0,$at
+ sltu $at,$tp,$tj
+ bnez $at,.Lsub
+ $PTR_ADD $rp,$BNSZ
+
+ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,$sp
+ $PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,$sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: $LD $aj,($ap)
+ $PTR_ADD $ap,$BNSZ
+ $ST $zero,($tp)
+ $PTR_ADD $tp,$BNSZ
+ sltu $at,$tp,$tj
+ $ST $aj,($rp)
+ bnez $at,.Lcopy
+ $PTR_ADD $rp,$BNSZ
+
+ li $a0,1
+ li $t0,1
+
+ .set noreorder
+ move $sp,$fp
+ $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/mips.pl b/main/openssl/crypto/bn/asm/mips.pl
new file mode 100644
index 00000000..d2f3ef7b
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips.pl
@@ -0,0 +1,2583 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABIs(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
+# IRIX 5.x, not only because it doesn't support the new ABIs, but also
+# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
+# 64-bit instructions (daddu, dmultu, etc.) found below would only
+# cause an illegal instruction exception:-(
+#
+# In addition, the code depends on preprocessor flags set up by the
+# MIPSpro compiler driver (either as or cc) and therefore (probably?)
+# can't be compiled by the GNU assembler. The GNU C driver manages
+# fine, though... I mean, as long as -mmips-as is specified or is the
+# default option, because then it simply invokes /usr/bin/as, which in
+# turn takes perfect care of the preprocessor definitions. Another neat
+# feature offered by the MIPSpro assembler is an optimization pass.
+# This gave me the opportunity to have the code look more regular, as
+# all those architecture-dependent instruction rescheduling details
+# were left to the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+# <appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanically replacing 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction a 32-bit compiler can't use]; one
+# has to be content with a 40-85% improvement depending on benchmark
+# and key length, more for longer keys.
+
+$flavour = shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $DIVU="ddivu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $SRL="dsrl";
+ $SLL="dsll";
+ $BNSZ=8;
+ $PTR_ADD="daddu";
+ $PTR_SUB="dsubu";
+ $SZREG=8;
+ $REG_S="sd";
+ $REG_L="ld";
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $DIVU="divu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $SRL="srl";
+ $SLL="sll";
+ $BNSZ=4;
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
+ $SZREG=4;
+ $REG_S="sw";
+ $REG_L="lw";
+ $code=".set mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $a2,bn_mul_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ $LD $t2,$BNSZ($a1)
+ $LD $t3,$BNSZ($a0)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
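+				# (Reference note, added for clarity:
+				# this is the classic carry idiom --
+				# after "ADDU t1,v0" the carry out is
+				# exactly (t1 < v0) as an unsigned
+				# compare, which is what sltu yields.)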
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ $MULTU $t2,$a3
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+
+ $LD $ta2,3*$BNSZ($a1)
+ $LD $ta3,3*$BNSZ($a0)
+ $ADDU $t3,$v0
+ sltu $v0,$t3,$v0
+ mflo $at
+ mfhi $t2
+ $ADDU $t3,$at
+ $ADDU $v0,$t2
+ $MULTU $ta0,$a3
+ sltu $at,$t3,$at
+ $ST $t3,$BNSZ($a0)
+ $ADDU $v0,$at
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ $ADDU $ta1,$v0
+ sltu $v0,$ta1,$v0
+ mflo $at
+ mfhi $ta0
+ $ADDU $ta1,$at
+ $ADDU $v0,$ta0
+ $MULTU $ta2,$a3
+ sltu $at,$ta1,$at
+ $ST $ta1,-2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+
+ and $ta0,$a2,$minus4
+ $ADDU $ta3,$v0
+ sltu $v0,$ta3,$v0
+ mflo $at
+ mfhi $ta2
+ $ADDU $ta3,$at
+ $ADDU $v0,$ta2
+ sltu $at,$ta3,$at
+ $ST $ta3,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_add_words_loop
+ $ADDU $v0,$at
+
+ beqz $a2,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,$BNSZ($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $a2,bn_mul_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $MULTU $t2,$a3
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $at
+ mfhi $t2
+ $ADDU $v0,$at
+ sltu $t3,$v0,$at
+ $MULTU $ta0,$a3
+ $ST $v0,-3*$BNSZ($a0)
+ $ADDU $v0,$t3,$t2
+
+ mflo $at
+ mfhi $ta0
+ $ADDU $v0,$at
+ sltu $ta1,$v0,$at
+ $MULTU $ta2,$a3
+ $ST $v0,-2*$BNSZ($a0)
+ $ADDU $v0,$ta1,$ta0
+
+ and $ta0,$a2,$minus4
+ mflo $at
+ mfhi $ta2
+ $ADDU $v0,$at
+ sltu $ta3,$v0,$at
+ $ST $v0,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_words_loop
+ $ADDU $v0,$ta3,$ta2
+
+ beqz $a2,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,2*$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+
+.L_bn_mul_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $a2,bn_sqr_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+
+ $MULTU $t2,$t2
+ subu $a2,4
+ $PTR_ADD $a0,8*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $t3
+ mfhi $t2
+ $ST $t3,-6*$BNSZ($a0)
+ $ST $t2,-5*$BNSZ($a0)
+
+ $MULTU $ta0,$ta0
+ mflo $ta1
+ mfhi $ta0
+ $ST $ta1,-4*$BNSZ($a0)
+ $ST $ta0,-3*$BNSZ($a0)
+
+
+ $MULTU $ta2,$ta2
+ and $ta0,$a2,$minus4
+ mflo $ta3
+ mfhi $ta2
+ $ST $ta3,-2*$BNSZ($a0)
+
+ .set noreorder
+ bgtz $ta0,.L_bn_sqr_words_loop
+ $ST $ta2,-$BNSZ($a0)
+
+ beqz $a2,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,2*$BNSZ($a0)
+ $ST $t0,3*$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$t0
+ mflo $t1
+ mfhi $t0
+ $ST $t1,4*$BNSZ($a0)
+ $ST $t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_sqr_words_internal
+
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $a3,bn_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ $ADDU $ta0,$t0
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta1,$t1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta3,$t3
+ sltu $t9,$ta3,$t3
+ $ADDU $t3,$ta3,$v0
+ sltu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_add_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ $ADDU $ta0,$t0
+ subu $a3,1
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t1,$BNSZ($a1)
+ $LD $ta1,$BNSZ($a2)
+ $ADDU $ta1,$t1
+ subu $a3,1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_add_words_internal
+
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $a3,bn_sub_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$zero
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t3,$ta3
+ $SUBU $ta3,$t3,$ta3
+ $SUBU $t3,$ta3,$v0
+ sgtu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_sub_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,1
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t1,$BNSZ($a1)
+ subu $a3,1
+ $LD $ta1,$BNSZ($a2)
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_sub_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_sub_words_internal
+
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+	move	$a3,$a0	# we know that bn_div_words does not
+			# touch $a3, $ta2, $ta3 and preserves $a2,
+			# so that we can save two arguments
+			# and the return address in registers
+			# instead of on the stack:-)
+
+ $LD $a0,($a3)
+ move $ta2,$a1
+ bne $a0,$a2,bn_div_3_words_internal
+ $LD $a1,-$BNSZ($a3)
+ li $v0,-1
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ move $ta3,$ra
+ bal bn_div_words_internal
+ move $ra,$ta3
+ $MULTU $ta2,$v0
+ $LD $t2,-2*$BNSZ($a3)
+ move $ta0,$zero
+ mfhi $t1
+ mflo $t0
+ sltu $t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+ bnez $t8,.L_bn_div_3_words_inner_loop_done
+ sgeu $at,$t2,$t0
+ seq $t9,$t1,$a1
+ and $at,$t9
+ sltu $t3,$t0,$ta2
+ $ADDU $a1,$a2
+ $SUBU $t1,$t3
+ $SUBU $t0,$ta2
+ sltu $t8,$t1,$a1
+ sltu $ta0,$a1,$a2
+ or $t8,$ta0
+ .set noreorder
+ beqz $at,.L_bn_div_3_words_inner_loop
+ $SUBU $v0,1
+ $ADDU $v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words_internal
+
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $a2,bn_div_words_internal
+ li $v0,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $ra
+ move $a0,$v0
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ move $v1,$zero
+ bltz $a2,.L_bn_div_words_body
+ move $t9,$v1
+ $SLL $a2,1
+ bgtz $a2,.-4
+ addu $t9,1
+
+ .set reorder
+ negu $t1,$t9
+ li $t2,-1
+ $SLL $t2,$t1
+ and $t2,$a0
+ $SRL $at,$a1,$t1
+ .set noreorder
+ beqz $t2,.+12
+ nop
+ break 6 # signal overflow
+ .set reorder
+ $SLL $a0,$t9
+ $SLL $a1,$t9
+ or $a0,$at
+___
+$QT=$ta0;
+$HH=$ta1;
+$DH=$v1;
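+# (Reference note, added for clarity: the body below is classic
+# two-digit schoolbook division. The divisor is first normalized --
+# shifted left until its top bit is set, counting the shift in $t9 --
+# then each half-word quotient digit is estimated with DIVU by the
+# divisor's high half $DH and corrected downward in an inner loop;
+# the two digits are merged into $v0 and the remainder is left in $v1.)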
+$code.=<<___;
+.L_bn_div_words_body:
+ $SRL $DH,$a2,4*$BNSZ # bits
+ sgeu $at,$a0,$a2
+ .set noreorder
+ beqz $at,.+12
+ nop
+ $SUBU $a0,$a2
+ .set reorder
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div1
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div1:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop1:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v0,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop1_done
+ $SUBU $t1,$v0
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop1
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ $SLL $a1,4*$BNSZ # bits
+ $SUBU $a0,$t3,$t0
+ $SLL $v0,$QT,4*$BNSZ # bits
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div2
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div2:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop2:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v1,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop2_done
+ $SUBU $t1,$v1
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop2
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+
+ $SUBU $a0,$t3,$t0
+ or $v0,$QT
+ $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
+ $SRL $a2,$t9 # restore $a2
+
+ .set noreorder
+ move $a1,$v1
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_words_internal
+___
+undef $HH; undef $QT; undef $DH;
+
+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
+
+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
+
+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_mul_comba8
+.ent bn_mul_comba8
+bn_mul_comba8:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,12*$SZREG,$ra
+ .mask 0x803ff008,-$SZREG
+ $PTR_SUB $sp,12*$SZREG
+ $REG_S $ra,11*$SZREG($sp)
+ $REG_S $s5,10*$SZREG($sp)
+ $REG_S $s4,9*$SZREG($sp)
+ $REG_S $s3,8*$SZREG($sp)
+ $REG_S $s2,7*$SZREG($sp)
+ $REG_S $s1,6*$SZREG($sp)
+ $REG_S $s0,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x003f0000,-$SZREG
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $s5,5*$SZREG($sp)
+ $REG_S $s4,4*$SZREG($sp)
+ $REG_S $s3,3*$SZREG($sp)
+ $REG_S $s2,2*$SZREG($sp)
+ $REG_S $s1,1*$SZREG($sp)
+ $REG_S $s0,0*$SZREG($sp)
+___
+$code.=<<___;
+
+ .set reorder
+	$LD	$a_0,0($a1)	# If compiled with the -mips3 option on
+				# an R5000 box, the assembler barks on
+				# this line with a "should not have
+				# mult/div as last instruction in bb
+				# (R10K bug)" warning. If anybody out
+				# there has a clue about how to
+				# circumvent this, do send me a note.
+				# <appro\@fy.chalmers.se>
+
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ $LD $b_4,4*$BNSZ($a2)
+ $LD $b_5,5*$BNSZ($a2)
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ $LD $b_6,6*$BNSZ($a2)
+ $LD $b_7,7*$BNSZ($a2)
+ $ST $c_1,0($a0) # r[0]=c1;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0) # r[1]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
+ $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s5,10*$SZREG($sp)
+ $REG_L $s4,9*$SZREG($sp)
+ $REG_L $s3,8*$SZREG($sp)
+ $REG_L $s2,7*$SZREG($sp)
+ $REG_L $s1,6*$SZREG($sp)
+ $REG_L $s0,5*$SZREG($sp)
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,12*$SZREG
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ $REG_L $s5,5*$SZREG($sp)
+ $REG_L $s4,4*$SZREG($sp)
+ $REG_L $s3,3*$SZREG($sp)
+ $REG_L $s2,2*$SZREG($sp)
+ $REG_L $s1,1*$SZREG($sp)
+ $REG_L $s0,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+.end bn_mul_comba8
+
+.align 5
+.globl bn_mul_comba4
+.ent bn_mul_comba4
+bn_mul_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_mul_comba4
+___
+
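The squaring routines below rely on `mul_add_c2`, which accumulates each off-diagonal product twice. Doubling a double-word product can push a bit out of the top, which is why the code samples the sign bit (slt against $zero) before every one-position $SLL. A hedged C sketch of the variant, under the same illustrative assumptions as the mul_add_c model above:

    #include <stdint.h>

    static void mul_add_c2(uint64_t a, uint64_t b,
                           uint64_t *c0, uint64_t *c1, uint64_t *c2)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);

        *c2 += hi >> 63;                 /* bit shifted out of the top  */
        hi = (hi << 1) | (lo >> 63);     /* double the product          */
        lo <<= 1;
        *c0 += lo;
        if (*c0 < lo && ++hi == 0)       /* carry may wrap hi itself    */
            (*c2)++;
        *c1 += hi;
        if (*c1 < hi)
            (*c2)++;
    }
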
+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_sqr_comba8
+.ent bn_sqr_comba8
+bn_sqr_comba8:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
+ $ADDU $c_2,$at
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,7*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,9*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,11*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,13*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0)
+ $ST $c_1,15*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba8
+
+.align 5
+.globl bn_sqr_comba4
+.ent bn_sqr_comba4
+bn_sqr_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba4
+___
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/modexp512-x86_64.S b/main/openssl/crypto/bn/asm/modexp512-x86_64.S
new file mode 100644
index 00000000..6cccafb8
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/modexp512-x86_64.S
@@ -0,0 +1,1773 @@
+.text
+
+.type MULADD_128x512,@function
+.align 16
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+.size MULADD_128x512,.-MULADD_128x512
+.type mont_reduce,@function
+.align 16
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+.size mont_reduce,.-mont_reduce
+.type mont_mul_a3b,@function
+.align 16
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+.size mont_mul_a3b,.-mont_mul_a3b
+.type sqr_reduce,@function
+.align 16
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+.size sqr_reduce,.-sqr_reduce
+.globl mod_exp_512
+.type mod_exp_512,@function
+mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+.Lbody:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size mod_exp_512, . - mod_exp_512
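The file above composes three internal primitives -- MULADD_128x512 (scalar multiply-accumulate), mont_reduce (word-by-word Montgomery reduction against the modulus cached on the stack), and sqr_reduce/mont_mul_a3b (squaring and multiplication feeding that reduction) -- into a fixed 5-bit-window exponentiation with a swizzled precomputation table. A hedged sketch of the entry point's apparent interface, inferred from the register and stack usage rather than from any header in this patch (limb order and const-qualifiers are assumptions):

    #include <stdint.h>

    /* r = g^e mod m; each operand is eight little-endian 64-bit limbs.
     * Illustrative declaration only -- not copied from OpenSSL source. */
    void mod_exp_512(uint64_t *r, const uint64_t *g,
                     const uint64_t *e, const uint64_t *m);
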
diff --git a/main/openssl/crypto/bn/asm/modexp512-x86_64.pl b/main/openssl/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 00000000..bfd6e975
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1497 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+# Author: Vinodh.Gopal@intel.com
+# Jim Guilford
+# Erdinc.Ozturk@intel.com
+# Maxim.Perminov@intel.com
+#
+# More information about algorithm used can be found at:
+# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+# software must display the following acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+# endorse or promote products derived from this software without
+# prior written permission. For written permission, please contact
+# licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+# nor may "OpenSSL" appear in their names without prior written
+# permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+# acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov ($ASRC), $X[0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ mov (+8*$i)($ASRC), $X[$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
+
+#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
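+
+# A reference model of one MULSTEP_512 pass (exposition only: defined
+# but never called, so the generated code is unchanged). It assumes
+# $x and $src2 are arrayrefs of 8 Math::BigInt limbs and $op is one
+# 64-bit multiplier; the retired low limb (what goes to $DST) is the
+# return value.
+sub _mulstep_512_model
+{
+ use Math::BigInt;
+ my ($x, $src2, $op) = @_;
+ my $base = Math::BigInt->new(2)->bpow(64);
+ my $t = $x->[0] + $src2->[0] * $op;
+ my $dst = $t % $base; # limb retired to memory ($DST)
+ my $carry = $t >> 64;
+ for my $i (1..7) {
+  $t = $x->[$i] + $src2->[$i] * $op + $carry;
+  $x->[$i] = $t % $base; # running limbs stay in registers
+  $carry = $t >> 64;
+ }
+ $x->[0] = $carry; # "mov %rdx, $X[0]": new top limb
+ return $dst;
+}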
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+ mov \$8, $cnt
+loop_$m:
+ mov ($pSrc), $d0
+ mov $d0#w, ($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*1)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*2)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*3)($pDst)
+ lea 8($pSrc), $pSrc
+ lea 64*4($pDst), $pDst
+ dec $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to flat space
+#MACRO unswizzle pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+ mov \$4, $cnt
+loop_$m:
+ movzxw (+64*3+256*0)($pSrc), $d0
+ movzxw (+64*3+256*1)($pSrc), $d1
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*2+256*0)($pSrc), $d0#w
+ mov (+64*2+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*1+256*0)($pSrc), $d0#w
+ mov (+64*1+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*0+256*0)($pSrc), $d0#w
+ mov (+64*0+256*1)($pSrc), $d1#w
+ mov $d0, (+8*0)($pDst)
+ mov $d1, (+8*1)($pDst)
+ lea 256*2($pSrc), $pSrc
+ lea 8*2($pDst), $pDst
+ sub \$1, $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
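+
+# A reference model of the swizzled layout (exposition only, never
+# called): entry $k of the 32-entry g-array keeps its 16-bit word $i
+# at byte offset 2*$k + 64*$i, so every entry spans the same set of
+# cache lines and a lookup's address trace does not depend on the
+# secret window value. Assumes a 64-bit perl, $tbl a ref to a
+# 2048-byte string, $limbs an arrayref of 8 native 64-bit limbs.
+sub _swizzle_model
+{
+ my ($tbl, $k, $limbs) = @_;
+ for my $i (0 .. 31) {
+  my $w = ($limbs->[$i >> 2] >> (16 * ($i & 3))) & 0xffff;
+  substr($$tbl, 2*$k + 64*$i, 2) = pack("v", $w); # little-endian word
+ }
+}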
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset Value
+# 0C0 Carries
+# 0B8 X2[10]
+# 0B0 X2[9]
+# 0A8 X2[8]
+# 0A0 X2[7]
+# 098 X2[6]
+# 090 X2[5]
+# 088 X2[4]
+# 080 X2[3]
+# 078 X2[2]
+# 070 X2[1]
+# 068 X2[0]
+# 060 X1[12] P[10]
+# 058 X1[11] P[9] Z[8]
+# 050 X1[10] P[8] Z[7]
+# 048 X1[9] P[7] Z[6]
+# 040 X1[8] P[6] Z[5]
+# 038 X1[7] P[5] Z[4]
+# 030 X1[6] P[4] Z[3]
+# 028 X1[5] P[3] Z[2]
+# 020 X1[4] P[2] Z[1]
+# 018 X1[3] P[1] Z[0]
+# 010 X1[2] P[0] Y[2]
+# 008 X1[1] Q[1] Y[1]
+# 000 X1[0] Q[0] Y[0]
+
+my $X1_offset = 0; # 13 qwords
+my $X2_offset = $X1_offset + 13*8; # 11 qwords
+my $Carries_offset = $X2_offset + 11*8; # 1 qword
+my $Q_offset = 0; # 2 qwords
+my $P_offset = $Q_offset + 2*8; # 11 qwords
+my $Y_offset = 0; # 3 qwords
+my $Z_offset = $Y_offset + 3*8; # 9 qwords
+
+my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset value
+# ... <old stack contents>
+# ...
+# 280 Garray
+
+# 278 tmp16[15]
+# ... ...
+# 200 tmp16[0]
+
+# 1F8 tmp[7]
+# ... ...
+# 1C0 tmp[0]
+
+# 1B8 GT[7]
+# ... ...
+# 180 GT[0]
+
+# 178 Reduce Data
+# ... ...
+# 0B8 Reduce Data
+# 0B0 reserved
+# 0A8 reserved
+# 0A0 reserved
+# 098 reserved
+# 090 reserved
+# 088 reduce result addr
+# 080 exp[8]
+
+# ...
+# 048 exp[1]
+# 040 exp[0]
+
+# 038 reserved
+# 030 loop_idx
+# 028 pg
+# 020 i
+# 018 pData ; arg 4
+# 010 pG ; arg 2
+# 008 pResult ; arg 1
+# 000 rsp ; stack pointer before subtract
+
+my $rsp_offset = 0;
+my $pResult_offset = 8*1 + $rsp_offset;
+my $pG_offset = 8*1 + $pResult_offset;
+my $pData_offset = 8*1 + $pG_offset;
+my $i_offset = 8*1 + $pData_offset;
+my $pg_offset = 8*1 + $i_offset;
+my $loop_idx_offset = 8*1 + $pg_offset;
+my $reserved1_offset = 8*1 + $loop_idx_offset;
+my $exp_offset = 8*1 + $reserved1_offset;
+my $red_result_addr_offset= 8*9 + $exp_offset;
+my $reserved2_offset = 8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset = 8*5 + $reserved2_offset;
+my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset = 8*8 + $GT_offset;
+my $tmp16_offset = 8*8 + $tmp_offset;
+my $garray_offset = 8*16 + $tmp16_offset;
+my $mem_size = 8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+# struct MODF_2FOLD_MONT_512_C1_DATA {
+# UINT64 t[8][8];
+# UINT64 m[8];
+# UINT64 m1[8]; /* 2^768 % m */
+# UINT64 m2[8]; /* 2^640 % m */
+# UINT64 k1[2]; /* (- 1/m) % 2^128 */
+# };
+
+my $T = 0;
+my $M = 512; # = 8 * 8 * 8
+my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
+my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
+my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
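+
+# How the precomputed constants relate to the modulus m (exposition
+# only, never called; the real values arrive via the struct above):
+sub _mod_ctx_model
+{
+ use Math::BigInt;
+ my ($m) = @_; # 512-bit odd modulus, Math::BigInt
+ my $b128 = Math::BigInt->new(2)->bpow(128);
+ return {
+  m1 => Math::BigInt->new(2)->bpow(768)->bmod($m),
+  m2 => Math::BigInt->new(2)->bpow(640)->bmod($m),
+  k1 => $m->copy->bmodinv($b128)->bneg()->bmod($b128),
+ };
+}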
+
+#
+# FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+# and add 512-bits (8 qwords)
+# to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+# 512-bit mul source: [rsi+8*n]
+# 512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
+$code.=<<___;
+.type MULADD_128x512,\@abi-omnipotent
+.align 16
+MULADD_128x512:
+___
+ &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ mov (+8*1)(%rdi), %rbp
+___
+ &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ ret
+.size MULADD_128x512,.-MULADD_128x512
+___
+}}}
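+
+# In bignum terms (exposition only, never called): MULADD_128x512
+# returns acc + a*b, at most 640 bits for 128-bit a, 512-bit b and
+# 512-bit acc -- i.e. two chained MULSTEP_512 passes.
+sub _muladd_128x512_model
+{
+ use Math::BigInt;
+ my ($a128, $b512, $acc512) = @_; # Math::BigInt inputs
+ return $acc512 + $a128 * $b512;
+}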
+
+{{{
+#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination (768 bits, 12 qwords)
+# pA: Multiplicand (1024 bits, 16 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+ mov (+8*12)($pA), $OP
+___
+ &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*13)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*14)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*15)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
+# UINT64 *m, /* 512 bits, 8 qwords */
+# MODF_2FOLD_MONT_512_C1_DATA *data,
+# UINT64 *r) /* 512 bits, 8 qwords */
+# Input: x (number to be reduced): tmp16 (Implicit)
+# m (modulus): [pM] (Implicit)
+# data (reduce data): [pData] (Implicit)
+# Output: r (result): Address in [red_res_addr]
+# result also in: r9, r8, r15, r14, r13, r12, r11, r10
+
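+# A reference model of the 2-fold reduction below (exposition only,
+# never called): it returns x * 2^-128 mod m. The assembly replaces
+# the final bmod with precomputed corrections from the T[] table plus
+# one conditional subtraction of m.
+sub _mont_reduce_model
+{
+ use Math::BigInt;
+ my ($x, $m) = @_; # 1024-bit x, 512-bit odd m
+ my $two = Math::BigInt->new(2);
+ my ($b128, $b640, $b768) = map { $two->copy->bpow($_) } (128, 640, 768);
+ my $m1 = $two->copy->bpow(768)->bmod($m); # M1
+ my $m2 = $two->copy->bpow(640)->bmod($m); # M2
+ my $k1 = $m->copy->bmodinv($b128)->bneg()->bmod($b128); # K1
+ my $x1 = ($x / $b768) * $m1 + $x % $b768; # X1 = Xh*M1 + Xl, < 2^769
+ my $x2 = ($x1 / $b640) * $m2 + $x1 % $b640; # X2 = X1h*M2 + X1l, 11 qwords
+ my $q  = ($x2 * $k1) % $b128; # Q = (-X2/m) mod 2^128
+ return (($x2 + $q * $m) / $b128)->bmod($m); # division is exact
+}
+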
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type mont_reduce,\@abi-omnipotent
+.align 16
+mont_reduce:
+___
+
+my $STACK_DEPTH = 8;
+ #
+ # X1 = Xh * M1 + Xl
+$code.=<<___;
+ lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
+ add \$$M1, %rsi
+ lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
+
+___
+
+ &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
+ # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+ xor %rax, %rax
+ # X1 += xl
+ add (+8*8)(%rcx), $X[4]
+ adc (+8*9)(%rcx), $X[5]
+ adc (+8*10)(%rcx), $X[6]
+ adc (+8*11)(%rcx), $X[7]
+ adc \$0, %rax
+ # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+ #
+ # check for carry ;; carry stored in rax
+ mov $X[4], (+8*8)(%rdi) # rdi points to X1
+ mov $X[5], (+8*9)(%rdi)
+ mov $X[6], %rbp
+ mov $X[7], (+8*11)(%rdi)
+
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ mov (+8*0)(%rdi), $X[4]
+ mov (+8*1)(%rdi), $X[5]
+ mov (+8*2)(%rdi), $X[6]
+ mov (+8*3)(%rdi), $X[7]
+
+ # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+ # rdi -> X1
+ # rsi -> M1
+
+ #
+ # X2 = Xh * M2 + Xl
+ # do first part (X2 = Xh * M2)
+ add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
+ # Xh is actually { [rdi+8*1], rbp }
+ add \$`$M2-$M1`, %rsi # rsi -> M2
+ lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
+___
+ unshift(@X,pop(@X)); unshift(@X,pop(@X));
+$code.=<<___;
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+ # X2 += Xl
+ add (+8*8-8*10)(%rdi), $X[6] # (-8*10) adjusts rdi from Xh back to Xl
+ adc (+8*9-8*10)(%rdi), $X[7]
+ mov $X[6], (+8*8)(%rcx)
+ mov $X[7], (+8*9)(%rcx)
+
+ adc %rax, %rax
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
+ add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
+
+ # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
+ # B1:B0 = rsi[1:0] = K1[1:0]
+ # A1:A0 = rcx[1:0] = X2[1:0]
+ # Result = rdi[1],rbp = Q[1],rbp
+ mov (%rsi), %r8 # B0
+ mov (+8*1)(%rsi), %rbx # B1
+
+ mov (%rcx), %rax # A0
+ mul %r8 # B0
+ mov %rax, %rbp
+ mov %rdx, %r9
+
+ mov (+8*1)(%rcx), %rax # A1
+ mul %r8 # B0
+ add %rax, %r9
+
+ mov (%rcx), %rax # A0
+ mul %rbx # B1
+ add %rax, %r9
+
+ mov %r9, (+8*1)(%rdi)
+ # end MUL_128x128t128
+
+ sub \$`$K1-$M`, %rsi
+
+ mov (%rcx), $X[6]
+ mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+ # load first half of m to rdx, rdi, rbx, rax
+ # moved this here for efficiency
+ mov (+8*0)(%rsi), %rax
+ mov (+8*1)(%rsi), %rbx
+ mov (+8*2)(%rsi), %rdi
+ mov (+8*3)(%rsi), %rdx
+
+ # continue with reduction
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+ add (+8*8)(%rcx), $X[6]
+ adc (+8*9)(%rcx), $X[7]
+
+ # accumulate the final carry to rbp
+ adc %rbp, %rbp
+
+ # Add in overflow corrections: R = (X2>>128) += T[overflow]
+ # R = {r9, r8, r15, r14, ..., r10}
+ shl \$3, %rbp
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> Data (and points to T)
+ add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
+
+ # rsi will be used to generate a mask after the addition
+ xor %rsi, %rsi
+
+ add (+8*8*0)(%rbp), $X[0]
+ adc (+8*8*1)(%rbp), $X[1]
+ adc (+8*8*2)(%rbp), $X[2]
+ adc (+8*8*3)(%rbp), $X[3]
+ adc (+8*8*4)(%rbp), $X[4]
+ adc (+8*8*5)(%rbp), $X[5]
+ adc (+8*8*6)(%rbp), $X[6]
+ adc (+8*8*7)(%rbp), $X[7]
+
+ # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
+ # if carry is clear: rsi = 0x0000000000000000
+ sbb \$0, %rsi
+
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ mov \$1, %rbp
+ sub %rax, $X[0]
+ sbb %rbx, $X[1]
+ sbb %rdi, $X[2]
+ sbb %rdx, $X[3]
+
+ # if there is a borrow: rbp = 0
+ # if there is no borrow: rbp = 1
+ # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+ sbb \$0, %rbp
+
+ # load second half of m to rdx, rdi, rbx, rax
+
+ add \$$M, %rcx
+ mov (+8*4)(%rcx), %rax
+ mov (+8*5)(%rcx), %rbx
+ mov (+8*6)(%rcx), %rdi
+ mov (+8*7)(%rcx), %rdx
+
+ # use the rsi mask as before
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ # if rbp = 0, there was a borrow before, it is moved to the carry flag
+ # if rbp = 1, there was not a borrow before, carry flag is cleared
+ sub \$1, %rbp
+
+ sbb %rax, $X[4]
+ sbb %rbx, $X[5]
+ sbb %rdi, $X[6]
+ sbb %rdx, $X[7]
+
+ # write R back to memory
+
+ mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+ mov $X[0], (+8*0)(%rsi)
+ mov $X[1], (+8*1)(%rsi)
+ mov $X[2], (+8*2)(%rsi)
+ mov $X[3], (+8*3)(%rsi)
+ mov $X[4], (+8*4)(%rsi)
+ mov $X[5], (+8*5)(%rsi)
+ mov $X[6], (+8*6)(%rsi)
+ mov $X[7], (+8*7)(%rsi)
+
+ ret
+.size mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination (1024 bits, 16 qwords)
+# pA: Multiplicand (512 bits, 8 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+# B operand in [pB] and also in x7...x0
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+
+$code.=<<___;
+ mov (+8*0)($pA), $OP
+
+ mov $X[0], %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov $X[$i], %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i-1]
+ adc \$0, %rdx
+ mov %rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov (+8*$i)($pA), $OP
+___
+
+ &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+ push(@X,shift(@X));
+}
+
+$code.=<<___;
+ mov $X[0], (+$pDst_o+8*8)($pDst)
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+ mov $X[3], (+$pDst_o+8*11)($pDst)
+ mov $X[4], (+$pDst_o+8*12)($pDst)
+ mov $X[5], (+$pDst_o+8*13)($pDst)
+ mov $X[6], (+$pDst_o+8*14)($pDst)
+ mov $X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input: src1: Address of source 1: rdi
+# src2: Address of source 2: rsi
+# Output: dst: Address of destination: [red_res_addr]
+# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp: Clobbers [tmp16], all registers
+$code.=<<___;
+.type mont_mul_a3b,\@abi-omnipotent
+.align 16
+mont_mul_a3b:
+ #
+ # multiply tmp = src1 * src2
+ # For multiply: dst = rcx, src1 = rdi, src2 = rsi
+ # stack depth is extra 8 from call
+___
+ &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+ #
+ # Dst = tmp % m
+ # Call reduce(tmp, m, data, dst)
+
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
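+
+# In bignum terms (exposition only): mont_mul_a3b(dst, src1, src2)
+# computes dst = src1 * src2 * 2^-128 mod m -- MUL_512x512 into tmp16
+# followed by the folding reduction modeled in _mont_reduce_model.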
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+$code.=<<___;
+ # ------------------
+ # first pass 01...07
+ # ------------------
+ mov $X[0], $A
+
+ mov $X[1],%rax
+ mul $A
+ mov %rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $X[$i-2]
+ mov $X[$i],%rax
+ mul $A
+ add %rax, $X[$i-2]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $x7
+
+ mov $X[0], (+$pDst_o+8*2)($pDst)
+
+ # ------------------
+ # second pass 12...17
+ # ------------------
+
+ mov (+8*1)($pA), $A
+
+ mov (+8*2)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*3)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*4)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ add $X[0], $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # third pass 23...27
+ # ------------------
+ mov (+8*2)($pA), $A
+
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ mov $X[3], (+$pDst_o+8*5)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+ mov $X[4], (+$pDst_o+8*6)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # fourth pass 34...37
+ # ------------------
+
+ mov (+8*3)($pA), $A
+
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*7)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+ mov $x7, (+$pDst_o+8*8)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+
+ mov %rdx, $X[5]
+
+ # ------------------
+ # fifth pass 45...47
+ # ------------------
+ mov (+8*4)($pA), $A
+
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # sixth pass 56...57
+ # ------------------
+ mov (+8*5)($pA), $A
+
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*11)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*12)($pDst)
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # seventh pass 67
+ # ------------------
+ mov $X[6], $A
+
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*13)($pDst)
+
+ mov %rdx, (+$pDst_o+8*14)($pDst)
+
+ # start finalize (add in squares, and double off-terms)
+ mov (+$pDst_o+8*1)($pDst), $X[0]
+ mov (+$pDst_o+8*2)($pDst), $X[1]
+ mov (+$pDst_o+8*3)($pDst), $X[2]
+ mov (+$pDst_o+8*4)($pDst), $X[3]
+ mov (+$pDst_o+8*5)($pDst), $X[4]
+ mov (+$pDst_o+8*6)($pDst), $X[5]
+
+ mov (+8*3)($pA), %rax
+ mul %rax
+ mov %rax, $x6
+ mov %rdx, $X[6]
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc \$0, $X[6]
+
+ mov (+8*0)($pA), %rax
+ mul %rax
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $A
+
+ mov (+8*1)($pA), %rax
+ mul %rax
+
+ add $A, $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+ mov $X[0], (+$pDst_o+8*1)($pDst)
+ mov $X[1], (+$pDst_o+8*2)($pDst)
+
+ mov (+8*2)($pA), %rax
+ mul %rax
+
+ add $A, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+
+ mov $X[2], (+$pDst_o+8*3)($pDst)
+ mov $X[3], (+$pDst_o+8*4)($pDst)
+
+ xor $tmp, $tmp
+ add $A, $X[4]
+ adc $x6, $X[5]
+ adc \$0, $tmp
+
+ mov $X[4], (+$pDst_o+8*5)($pDst)
+ mov $X[5], (+$pDst_o+8*6)($pDst)
+
+ # $tmp has 0/1 in column 7
+ # $X[6] (high half of a[3]^2) has a full value in column 7
+
+ mov (+$pDst_o+8*7)($pDst), $X[0]
+ mov (+$pDst_o+8*8)($pDst), $X[1]
+ mov (+$pDst_o+8*9)($pDst), $X[2]
+ mov (+$pDst_o+8*10)($pDst), $X[3]
+ mov (+$pDst_o+8*11)($pDst), $X[4]
+ mov (+$pDst_o+8*12)($pDst), $X[5]
+ mov (+$pDst_o+8*13)($pDst), $x6
+ mov (+$pDst_o+8*14)($pDst), $x7
+
+ mov $X[7], %rax
+ mul %rax
+ mov %rax, $X[7]
+ mov %rdx, $A
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc $x6, $x6
+ adc $x7, $x7
+ adc \$0, $A
+
+ add $tmp, $X[0]
+
+ mov (+8*4)($pA), %rax
+ mul %rax
+
+ add $X[6], $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[0], (+$pDst_o+8*7)($pDst)
+ mov $X[1], (+$pDst_o+8*8)($pDst)
+
+ mov (+8*5)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[2], (+$pDst_o+8*9)($pDst)
+ mov $X[3], (+$pDst_o+8*10)($pDst)
+
+ mov (+8*6)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[4]
+ adc %rax, $X[5]
+ adc \$0, %rdx
+
+ mov $X[4], (+$pDst_o+8*11)($pDst)
+ mov $X[5], (+$pDst_o+8*12)($pDst)
+
+ add %rdx, $x6
+ adc $X[7], $x7
+ adc \$0, $A
+
+ mov $x6, (+$pDst_o+8*13)($pDst)
+ mov $x7, (+$pDst_o+8*14)($pDst)
+ mov $A, (+$pDst_o+8*15)($pDst)
+___
+}
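+
+# A reference model of the strategy above (exposition only, never
+# called): sum each off-diagonal product a[i]*a[j], i < j, once,
+# double the sum, then add the diagonal squares -- the same 1024-bit
+# result as full schoolbook. Takes an arrayref of 8 Math::BigInt limbs.
+sub _sqr_512_model
+{
+ use Math::BigInt;
+ my ($a) = @_;
+ my $r = Math::BigInt->bzero();
+ for my $i (0 .. 7) {
+  for my $j ($i + 1 .. 7) {
+   $r += ($a->[$i] * $a->[$j]) << (64 * ($i + $j));
+  }
+ }
+ $r->blsft(1); # double the off-diagonal terms
+ for my $i (0 .. 7) {
+  $r += ($a->[$i] * $a->[$i]) << (128 * $i);
+ }
+ return $r;
+}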
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type sqr_reduce,\@abi-omnipotent
+.align 16
+sqr_reduce:
+ mov (+$pResult_offset+8)(%rsp), %rcx
+___
+ &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+# UINT64 *g, /* 512 bits, 8 qwords */
+# UINT64 *exp, /* 512 bits, 8 qwords */
+# struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries equ 32
+#table_size equ table_entries * 8
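+
+# A reference model of the schedule below (exposition only, never
+# called): seed the result with the top 5 exponent bits, run one
+# 2-squaring round (the "jmp sqr_2" entry) and then 101 5-squaring
+# rounds, each followed by a table multiply: 5 + 2 + 101*5 = 512 bits.
+# Works on plain residues; the assembly does the same in Montgomery
+# form. Inputs are Math::BigInt, e < 2^512, m odd.
+sub _mod_exp_512_model
+{
+ use Math::BigInt;
+ my ($g, $e, $m) = @_;
+ my @tbl = map { $g->copy->bmodpow($_, $m) } 0 .. 31; # g^0..g^31
+ my $r = $tbl[ $e->copy->brsft(507)->numify ]; # exp{511:507}
+ $e = $e % Math::BigInt->new(2)->bpow(507); # bits consumed
+ for (my $i = 505; $i >= 0; $i -= 5) {
+  my $s = ($i == 505) ? 2 : 5; # first round squares twice
+  $r = $r * $r % $m for 1 .. $s;
+  my $w = $e->copy->brsft($i)->bmod(32)->numify;
+  $r = $r * $tbl[$w] % $m; # multiply always, w may be 0
+ }
+ return $r; # g^e mod m
+}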
+$code.=<<___;
+.globl mod_exp_512
+.type mod_exp_512,\@function,4
+mod_exp_512:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # adjust stack down and then align it with cache boundary
+ mov %rsp, %r8
+ sub \$$mem_size, %rsp
+ and \$-64, %rsp
+
+ # store previous stack pointer and arguments
+ mov %r8, (+$rsp_offset)(%rsp)
+ mov %rdi, (+$pResult_offset)(%rsp)
+ mov %rsi, (+$pG_offset)(%rsp)
+ mov %rcx, (+$pData_offset)(%rsp)
+.Lbody:
+ # transform g into montgomery space
+ # GT = reduce(g * C2) = reduce(g * (2^256))
+ # reduce expects to have the input in [tmp16]
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rsi), %xmm0
+ movdqu (+16*1)(%rsi), %xmm1
+ movdqu (+16*2)(%rsi), %xmm2
+ movdqu (+16*3)(%rsi), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+ # load pExp before rdx gets blown away
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+
+ lea (+$GT_offset)(%rsp), %rbx
+ mov %rbx, (+$red_result_addr_offset)(%rsp)
+ call mont_reduce
+
+ # Initialize tmp = C = 2^128, i.e. 1 in Montgomery form
+ lea (+$tmp_offset)(%rsp), %rcx
+ xor %rax, %rax
+ mov %rax, (+8*0)(%rcx)
+ mov %rax, (+8*1)(%rcx)
+ mov %rax, (+8*3)(%rcx)
+ mov %rax, (+8*4)(%rcx)
+ mov %rax, (+8*5)(%rcx)
+ mov %rax, (+8*6)(%rcx)
+ mov %rax, (+8*7)(%rcx)
+ mov %rax, (+$exp_offset+8*8)(%rsp)
+ movq \$1, (+8*2)(%rcx)
+
+ lea (+$garray_offset)(%rsp), %rbp
+ mov %rcx, %rsi # pTmp
+ mov %rbp, %rdi # Garray[][0]
+___
+
+ &swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+ # for (rax = 31; rax != 0; rax--) {
+ # tmp = reduce(tmp * G)
+ # swizzle(pg, tmp);
+ # pg += 2; }
+$code.=<<___;
+ mov \$31, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ mov %rbp, (+$pg_offset)(%rsp)
+ # rsi -> pTmp
+ mov %rsi, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rsi), %r10
+ mov (+8*1)(%rsi), %r11
+ mov (+8*2)(%rsi), %r12
+ mov (+8*3)(%rsi), %r13
+ mov (+8*4)(%rsi), %r14
+ mov (+8*5)(%rsi), %r15
+ mov (+8*6)(%rsi), %r8
+ mov (+8*7)(%rsi), %r9
+init_loop:
+ lea (+$GT_offset)(%rsp), %rdi
+ call mont_mul_a3b
+ lea (+$tmp_offset)(%rsp), %rsi
+ mov (+$pg_offset)(%rsp), %rbp
+ add \$2, %rbp
+ mov %rbp, (+$pg_offset)(%rsp)
+ mov %rsi, %rcx # rcx = rsi = addr of tmp
+___
+
+ &swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+ mov (+$i_offset)(%rsp), %rax
+ sub \$1, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ jne init_loop
+
+ #
+ # Copy exponent onto stack
+ movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+ #
+ # Do exponentiation
+ # Initialize result to G[exp{511:507}]
+ mov (+$exp_offset+62)(%rsp), %eax
+ mov %rax, %rdx
+ shr \$11, %rax
+ and \$0x07FF, %edx
+ mov %edx, (+$exp_offset+62)(%rsp)
+ lea (+$garray_offset)(%rsp,%rax,2), %rsi
+ mov (+$pResult_offset)(%rsp), %rdx
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+ #
+ # Loop variables
+ # rcx = [loop_idx] = bit index: 505 down to 0, in steps of 5
+$code.=<<___;
+ movq \$505, (+$loop_idx_offset)(%rsp)
+
+ mov (+$pResult_offset)(%rsp), %rcx
+ mov %rcx, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rcx), %r10
+ mov (+8*1)(%rcx), %r11
+ mov (+8*2)(%rcx), %r12
+ mov (+8*3)(%rcx), %r13
+ mov (+8*4)(%rcx), %r14
+ mov (+8*5)(%rcx), %r15
+ mov (+8*6)(%rcx), %r8
+ mov (+8*7)(%rcx), %r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+ #
+ # Do multiply, first look up proper value in Garray
+ mov (+$loop_idx_offset)(%rsp), %rcx # bit index
+ mov %rcx, %rax
+ shr \$4, %rax # rax is word pointer
+ mov (+$exp_offset)(%rsp,%rax,2), %edx
+ and \$15, %rcx
+ shrq %cl, %rdx
+ and \$0x1F, %rdx
+
+ lea (+$garray_offset)(%rsp,%rdx,2), %rsi
+ lea (+$tmp_offset)(%rsp), %rdx
+ mov %rdx, %rdi
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+ # rdi = tmp = pG
+
+ #
+ # Call mont_mul_a3b(pDst, pSrc1, pSrc2, pM, pData)
+ # result result pG M Data
+$code.=<<___;
+ mov (+$pResult_offset)(%rsp), %rsi
+ call mont_mul_a3b
+
+ #
+ # finish loop
+ mov (+$loop_idx_offset)(%rsp), %rcx
+ sub \$5, %rcx
+ mov %rcx, (+$loop_idx_offset)(%rsp)
+ jge main_loop_a3b
+
+ #
+
+end_main_loop_a3b:
+ # transform result out of Montgomery space
+ # result = reduce(result)
+ mov (+$pResult_offset)(%rsp), %rdx
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
+ call mont_reduce
+
+ # If result > m, subtract m
+ # load result into r15:r8
+ mov (+$pResult_offset)(%rsp), %rax
+ mov (+8*0)(%rax), %r8
+ mov (+8*1)(%rax), %r9
+ mov (+8*2)(%rax), %r10
+ mov (+8*3)(%rax), %r11
+ mov (+8*4)(%rax), %r12
+ mov (+8*5)(%rax), %r13
+ mov (+8*6)(%rax), %r14
+ mov (+8*7)(%rax), %r15
+
+ # subtract m
+ mov (+$pData_offset)(%rsp), %rbx
+ add \$$M, %rbx
+
+ sub (+8*0)(%rbx), %r8
+ sbb (+8*1)(%rbx), %r9
+ sbb (+8*2)(%rbx), %r10
+ sbb (+8*3)(%rbx), %r11
+ sbb (+8*4)(%rbx), %r12
+ sbb (+8*5)(%rbx), %r13
+ sbb (+8*6)(%rbx), %r14
+ sbb (+8*7)(%rbx), %r15
+
+ # if Carry is clear, replace result with difference
+ mov (+8*0)(%rax), %rsi
+ mov (+8*1)(%rax), %rdi
+ mov (+8*2)(%rax), %rcx
+ mov (+8*3)(%rax), %rdx
+ cmovnc %r8, %rsi
+ cmovnc %r9, %rdi
+ cmovnc %r10, %rcx
+ cmovnc %r11, %rdx
+ mov %rsi, (+8*0)(%rax)
+ mov %rdi, (+8*1)(%rax)
+ mov %rcx, (+8*2)(%rax)
+ mov %rdx, (+8*3)(%rax)
+
+ mov (+8*4)(%rax), %rsi
+ mov (+8*5)(%rax), %rdi
+ mov (+8*6)(%rax), %rcx
+ mov (+8*7)(%rax), %rdx
+ cmovnc %r12, %rsi
+ cmovnc %r13, %rdi
+ cmovnc %r14, %rcx
+ cmovnc %r15, %rdx
+ mov %rsi, (+8*4)(%rax)
+ mov %rdi, (+8*5)(%rax)
+ mov %rcx, (+8*6)(%rax)
+ mov %rdx, (+8*7)(%rax)
+
+ mov (+$rsp_offset)(%rsp), %rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbx
+ mov 40(%rsi),%rbp
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mod_exp_512_se_handler,\@abi-omnipotent
+.align 16
+mod_exp_512_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov $rsp_offset(%rax),%rax # pull saved Rsp
+
+ mov 32(%rax),%rbx
+ mov 40(%rax),%rbp
+ mov 24(%rax),%r12
+ mov 16(%rax),%r13
+ mov 8(%rax),%r14
+ mov 0(%rax),%r15
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_mod_exp_512
+ .rva .LSEH_end_mod_exp_512
+ .rva .LSEH_info_mod_exp_512
+
+.section .xdata
+.align 8
+.LSEH_info_mod_exp_512:
+ .byte 9,0,0,0
+ .rva mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/parisc-mont.pl b/main/openssl/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 00000000..c02ef6f0
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,995 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
+# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal with respect to instruction set capabilities. Fair comparison
+# with the vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives the compiler
+# toward 4 times 16x16=32-bit multiplications [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can a 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs the "64-bit" computational task in the same
+# number of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually the same performance as pa-risc2[W].s:
+# it's ~10% better for the shortest key length and ~10% worse for the
+# longest one.
+#
+# In case it wasn't clear: the module has two distinct code paths,
+# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes a
+# couple of limitations: vector lengths have to be even and vector
+# addresses have to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
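+#
+# Illustration of the word-order trick (exposition only, never
+# called): on big-endian PA-RISC, swapping the two 32-bit halves of a
+# 64-bit BN_ULONG makes its storage read as two 32-bit limbs, less
+# significant first -- exactly what the shrpd-by-32 "flip word order"
+# in the code below achieves. Needs a 64-bit perl.
+sub _flip_word_order {
+ my ($v) = @_;
+ return (($v & 0xffffffff) << 32) | (($v >> 32) & 0xffffffff);
+}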
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode the PA-RISC 2.0 instructions used in this module,
+# so that it can be compiled with .LEVEL 1.0. It should be noted that
+# I wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
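+
+# For instance, tracing the format-5 branch above with `ldd 0(%r4),%r5`
+# (offset 0, base %r4, target %r5), the 32-bit pass would emit the
+# hand-encoded word in place of the 2.0 mnemonic, along the lines of
+# the following; the exact value is computed from the opcode fields
+# above, so treat it as illustrative:
+#
+#	.WORD	0x0c8010c5	; ldd	0(%r4),%r5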
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+	# I only have the ",u" completer; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ s/\bbv\b/bve/gm if ($SIZE_T==8);
+
+ print $_,"\n";
+}
+close STDOUT;
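
The pa-risc2 module above, and the ppc/s390x/x86 modules this patch goes on to touch, all implement the same word-serial Montgomery loop; the tp[j] and np[j]*m annotations refer to it. As a reading aid, a reference sketch in C follows; 64-bit limbs, a C99 VLA and GCC's unsigned __int128 are assumptions of the sketch, not OpenSSL's bn_mul_mont contract:

    #include <stdint.h>
    #include <string.h>

    /* rp = ap*bp*R^-1 mod np, R = 2^(64*num), n0 = -np[0]^-1 mod 2^64.
     * Reference semantics only; not OpenSSL's implementation. */
    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num)
    {
        uint64_t tp[num + 2];          /* tp[num] is the "upmost overflow bit" */
        memset(tp, 0, sizeof(tp));

        for (int i = 0; i < num; i++) {
            unsigned __int128 t = 0;   /* tp += bp[i]*ap */
            for (int j = 0; j < num; j++) {
                t += (unsigned __int128)ap[j] * bp[i] + tp[j];
                tp[j] = (uint64_t)t; t >>= 64;
            }
            t += tp[num];
            tp[num] = (uint64_t)t;
            tp[num + 1] += (uint64_t)(t >> 64);

            uint64_t m = tp[0] * n0;   /* chosen so tp[0] + m*np[0] == 0 mod 2^64 */
            t = ((unsigned __int128)np[0] * m + tp[0]) >> 64;
            for (int j = 1; j < num; j++) { /* tp += m*np, shifted down one word */
                t += (unsigned __int128)np[j] * m + tp[j];
                tp[j - 1] = (uint64_t)t; t >>= 64;
            }
            t += tp[num];
            tp[num - 1] = (uint64_t)t;
            tp[num] = tp[num + 1] + (uint64_t)(t >> 64);
            tp[num + 1] = 0;
        }

        /* conditional final subtraction, as in the Lsub/Lcopy tails above */
        uint64_t borrow = 0;
        for (int j = 0; j < num; j++) {
            unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
            rp[j] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        uint64_t keep = 0 - (uint64_t)(tp[num] < borrow);  /* all-ones: tp < np */
        for (int j = 0; j < num; j++)
            rp[j] = (tp[j] & keep) | (rp[j] & ~keep);
    }
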
diff --git a/main/openssl/crypto/bn/asm/ppc-mont.pl b/main/openssl/crypto/bn/asm/ppc-mont.pl
index 7849eae9..f9b6992c 100644
--- a/main/openssl/crypto/bn/asm/ppc-mont.pl
+++ b/main/openssl/crypto/bn/asm/ppc-mont.pl
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
$POP= $LD;
} else { die "nonsense $flavour"; }
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
#
$nhi="r0";
@@ -108,42 +109,48 @@ $code=<<___;
.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
- addi $ovf,$num,`$FRAME+$RZONE`
+ addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
+ mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
- $PUSH r14,`4*$SIZE_T`($sp)
- $PUSH r15,`5*$SIZE_T`($sp)
- $PUSH r16,`6*$SIZE_T`($sp)
- $PUSH r17,`7*$SIZE_T`($sp)
- $PUSH r18,`8*$SIZE_T`($sp)
- $PUSH r19,`9*$SIZE_T`($sp)
- $PUSH r20,`10*$SIZE_T`($sp)
- $PUSH r21,`11*$SIZE_T`($sp)
- $PUSH r22,`12*$SIZE_T`($sp)
- $PUSH r23,`13*$SIZE_T`($sp)
- $PUSH r24,`14*$SIZE_T`($sp)
- $PUSH r25,`15*$SIZE_T`($sp)
+ $PUSH r20,`-12*$SIZE_T`($tj)
+ $PUSH r21,`-11*$SIZE_T`($tj)
+ $PUSH r22,`-10*$SIZE_T`($tj)
+ $PUSH r23,`-9*$SIZE_T`($tj)
+ $PUSH r24,`-8*$SIZE_T`($tj)
+ $PUSH r25,`-7*$SIZE_T`($tj)
+ $PUSH r26,`-6*$SIZE_T`($tj)
+ $PUSH r27,`-5*$SIZE_T`($tj)
+ $PUSH r28,`-4*$SIZE_T`($tj)
+ $PUSH r29,`-3*$SIZE_T`($tj)
+ $PUSH r30,`-2*$SIZE_T`($tj)
+ $PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@@ -205,8 +212,8 @@ L1st:
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $LD $tj,$FRAME($sp) ; tp[0]
+ addi $tp,$sp,$LOCALS
+ $LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@@ -273,7 +280,7 @@ Linner:
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh
addi $j,$j,$BNSZ
bdnz- Lcopy
- $POP r14,`4*$SIZE_T`($sp)
- $POP r15,`5*$SIZE_T`($sp)
- $POP r16,`6*$SIZE_T`($sp)
- $POP r17,`7*$SIZE_T`($sp)
- $POP r18,`8*$SIZE_T`($sp)
- $POP r19,`9*$SIZE_T`($sp)
- $POP r20,`10*$SIZE_T`($sp)
- $POP r21,`11*$SIZE_T`($sp)
- $POP r22,`12*$SIZE_T`($sp)
- $POP r23,`13*$SIZE_T`($sp)
- $POP r24,`14*$SIZE_T`($sp)
- $POP r25,`15*$SIZE_T`($sp)
- $POP $sp,0($sp)
+ $POP $tj,0($sp)
li r3,1
+ $POP r20,`-12*$SIZE_T`($tj)
+ $POP r21,`-11*$SIZE_T`($tj)
+ $POP r22,`-10*$SIZE_T`($tj)
+ $POP r23,`-9*$SIZE_T`($tj)
+ $POP r24,`-8*$SIZE_T`($tj)
+ $POP r25,`-7*$SIZE_T`($tj)
+ $POP r26,`-6*$SIZE_T`($tj)
+ $POP r27,`-5*$SIZE_T`($tj)
+ $POP r28,`-4*$SIZE_T`($tj)
+ $POP r29,`-3*$SIZE_T`($tj)
+ $POP r30,`-2*$SIZE_T`($tj)
+ $POP r31,`-1*$SIZE_T`($tj)
+ mr $sp,$tj
blr
.long 0
-.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+ .byte 0,12,4,0,0x80,12,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/ppc.pl b/main/openssl/crypto/bn/asm/ppc.pl
index 37c65d35..1249ce22 100644
--- a/main/openssl/crypto/bn/asm/ppc.pl
+++ b/main/openssl/crypto/bn/asm/ppc.pl
@@ -389,7 +389,9 @@ $data=<<EOF;
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -814,8 +816,9 @@ $data=<<EOF;
blr
-
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -949,7 +952,7 @@ $data=<<EOF;
addze r11,r0
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
- $LD r7,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
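
The hunk above fixes an operand bug: per the mul_add_c(a[3],b[2],c3,c1,c2) annotation, b[2] must come from the b array in r5, yet the load fetched from the a array in r4. For reference, mul_add_c is the comba building block that accumulates a*b into a three-limb carry chain; a sketch of its semantics, with <stdint.h> types and GCC's unsigned __int128 as assumptions of the sketch:

    /* (c2:c1:c0) += a*b, unsigned 64-bit limbs, carries rippling upward */
    #define mul_add_c(a, b, c0, c1, c2) do {                        \
            unsigned __int128 t = (unsigned __int128)(a) * (b);     \
            uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);    \
            c0 += lo;  hi += (c0 < lo);    /* carry out of c0 */    \
            c1 += hi;  c2 += (c1 < hi);    /* carry out of c1 */    \
        } while (0)
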
@@ -966,7 +969,9 @@ $data=<<EOF;
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1502,7 +1507,9 @@ $data=<<EOF;
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1707,7 +1717,9 @@ Lppcasm_div8:
Lppcasm_div9:
or r3,r8,r0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/ppc64-mont.pl b/main/openssl/crypto/bn/asm/ppc64-mont.pl
index 3449b358..a14e769a 100644
--- a/main/openssl/crypto/bn/asm/ppc64-mont.pl
+++ b/main/openssl/crypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...
+# December 2009
+
+# Adapted for a 32-bit build, this module delivers a 25-120% (yes, more
+# than *twice* for longer keys) performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# 64-bit integer operations, and the trouble is that most PPC
+# operating systems don't preserve the upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not upon signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course the complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of the expected
+# figure. This is why this routine is engaged only for longer key
+# operations on these OSes; see crypto/ppccap.c for further details.
+# MacOS X is an exception and doesn't require signal masking, and
+# that's where the improvement coefficients above were collected. For
+# the others, the alternative would be to break the dependence on the
+# upper halves of GPRs by sticking to 32-bit integer operations...
+
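
The wrapper alluded to above lives in crypto/ppccap.c. Its shape, reconstructed from the description here (the exact mask handling is an assumption of this sketch, not a quote of ppccap.c):

    #include <signal.h>

    int bn_mul_mont_fpu64(unsigned long *rp, const unsigned long *ap,
                          const unsigned long *bp, const unsigned long *np,
                          const unsigned long *n0, int num);

    /* The FPU path clobbers upper GPR halves, which survive a context
     * switch but not 32-bit signal delivery, so block signals around
     * the call on the affected OSes. */
    static int mont_fpu64_guarded(unsigned long *rp, const unsigned long *ap,
                                  const unsigned long *bp,
                                  const unsigned long *np,
                                  const unsigned long *n0, int num)
    {
        sigset_t all_masked, oset;
        int ret;

        sigfillset(&all_masked);                      /* block everything */
        sigprocmask(SIG_SETMASK, &all_masked, &oset); /* remember old mask */
        ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
        sigprocmask(SIG_SETMASK, &oset, NULL);        /* restore */
        return ret;
    }
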
$flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont_ppc64";
+ $fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
$PUSH= "stw";
$POP= "lwz";
- die "not implemented yet";
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont";
+ $fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=($FRAME+63)&~63;
+$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
@@ -93,16 +110,16 @@ $tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
# . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
+# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
@@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25";
# . .
# . .
# +-------------------------------+
+# . .
+# -12*size_t +-------------------------------+
+# | 10 saved gpr, r22-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
$code=<<___;
.machine "any"
@@ -181,14 +197,14 @@ $code=<<___;
.globl .$fname
.align 5
.$fname:
- cmpwi $num,4
+ cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code
bltlr-
- andi. r0,$num,1 ; $num has to be even
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr-
- slwi $num,$num,3 ; num*=8
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1]
@@ -196,35 +212,50 @@ $code=<<___;
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
+ mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
- $PUSH r14,`2*$SIZE_T`($sp)
- $PUSH r15,`3*$SIZE_T`($sp)
- $PUSH r16,`4*$SIZE_T`($sp)
- $PUSH r17,`5*$SIZE_T`($sp)
- $PUSH r18,`6*$SIZE_T`($sp)
- $PUSH r19,`7*$SIZE_T`($sp)
- $PUSH r20,`8*$SIZE_T`($sp)
- $PUSH r21,`9*$SIZE_T`($sp)
- $PUSH r22,`10*$SIZE_T`($sp)
- $PUSH r23,`11*$SIZE_T`($sp)
- stfd f14,`12*$SIZE_T+0`($sp)
- stfd f15,`12*$SIZE_T+8`($sp)
- stfd f16,`12*$SIZE_T+16`($sp)
- stfd f17,`12*$SIZE_T+24`($sp)
- stfd f18,`12*$SIZE_T+32`($sp)
- stfd f19,`12*$SIZE_T+40`($sp)
- stfd f20,`12*$SIZE_T+48`($sp)
- stfd f21,`12*$SIZE_T+56`($sp)
- stfd f22,`12*$SIZE_T+64`($sp)
- stfd f23,`12*$SIZE_T+72`($sp)
- stfd f24,`12*$SIZE_T+80`($sp)
- stfd f25,`12*$SIZE_T+88`($sp)
-
+ $PUSH r22,`-12*8-10*$SIZE_T`($i)
+ $PUSH r23,`-12*8-9*$SIZE_T`($i)
+ $PUSH r24,`-12*8-8*$SIZE_T`($i)
+ $PUSH r25,`-12*8-7*$SIZE_T`($i)
+ $PUSH r26,`-12*8-6*$SIZE_T`($i)
+ $PUSH r27,`-12*8-5*$SIZE_T`($i)
+ $PUSH r28,`-12*8-4*$SIZE_T`($i)
+ $PUSH r29,`-12*8-3*$SIZE_T`($i)
+ $PUSH r30,`-12*8-2*$SIZE_T`($i)
+ $PUSH r31,`-12*8-1*$SIZE_T`($i)
+ stfd f20,`-12*8`($i)
+ stfd f21,`-11*8`($i)
+ stfd f22,`-10*8`($i)
+ stfd f23,`-9*8`($i)
+ stfd f24,`-8*8`($i)
+ stfd f25,`-7*8`($i)
+ stfd f26,`-6*8`($i)
+ stfd f27,`-5*8`($i)
+ stfd f28,`-4*8`($i)
+ stfd f29,`-3*8`($i)
+ stfd f30,`-2*8`($i)
+ stfd f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+ mr $t1,$n0
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ lwz $t0,4($ap)
+ lwz $n0,0($t1) ; pull n0[0,1] value
+ lwz $t1,4($t1)
+ lwz $t3,0($bp) ; bp[0,1]
+ lwz $t2,4($bp)
+ insrdi $a0,$t0,32,0
+ insrdi $n0,$t1,32,0
+ insrdi $t3,$t2,32,0
+___
+$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
@@ -258,6 +289,8 @@ $code=<<___;
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@ $code=<<___;
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@ $code=<<___;
.align 5
L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@ L1st:
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@ L1st:
li $i,8 ; i=1
.align 5
Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ lwz $t3,0($t0) ; bp[i,i+1]
+ lwz $t0,4($t0)
+ insrdi $t3,$t0,32,0
+___
+$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i]
@@ -761,6 +830,13 @@ Linner:
stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits
@@ -768,6 +844,13 @@ Linner:
stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry
@@ -816,7 +899,21 @@ Linner:
ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
@@ -835,7 +932,9 @@ Linner:
subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num
blt- Louter
+___
+$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh
stdx $i,$t4,$i
addi $i,$i,16
bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+ ldu $t2,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ extrdi $t1,$t0,32,0
+ extrdi $t3,$t2,32,0
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz- Lcopy
+___
- $POP r14,`2*$SIZE_T`($sp)
- $POP r15,`3*$SIZE_T`($sp)
- $POP r16,`4*$SIZE_T`($sp)
- $POP r17,`5*$SIZE_T`($sp)
- $POP r18,`6*$SIZE_T`($sp)
- $POP r19,`7*$SIZE_T`($sp)
- $POP r20,`8*$SIZE_T`($sp)
- $POP r21,`9*$SIZE_T`($sp)
- $POP r22,`10*$SIZE_T`($sp)
- $POP r23,`11*$SIZE_T`($sp)
- lfd f14,`12*$SIZE_T+0`($sp)
- lfd f15,`12*$SIZE_T+8`($sp)
- lfd f16,`12*$SIZE_T+16`($sp)
- lfd f17,`12*$SIZE_T+24`($sp)
- lfd f18,`12*$SIZE_T+32`($sp)
- lfd f19,`12*$SIZE_T+40`($sp)
- lfd f20,`12*$SIZE_T+48`($sp)
- lfd f21,`12*$SIZE_T+56`($sp)
- lfd f22,`12*$SIZE_T+64`($sp)
- lfd f23,`12*$SIZE_T+72`($sp)
- lfd f24,`12*$SIZE_T+80`($sp)
- lfd f25,`12*$SIZE_T+88`($sp)
- $POP $sp,0($sp)
+$code.=<<___;
+ $POP $i,0($sp)
li r3,1 ; signal "handled"
+ $POP r22,`-12*8-10*$SIZE_T`($i)
+ $POP r23,`-12*8-9*$SIZE_T`($i)
+ $POP r24,`-12*8-8*$SIZE_T`($i)
+ $POP r25,`-12*8-7*$SIZE_T`($i)
+ $POP r26,`-12*8-6*$SIZE_T`($i)
+ $POP r27,`-12*8-5*$SIZE_T`($i)
+ $POP r28,`-12*8-4*$SIZE_T`($i)
+ $POP r29,`-12*8-3*$SIZE_T`($i)
+ $POP r30,`-12*8-2*$SIZE_T`($i)
+ $POP r31,`-12*8-1*$SIZE_T`($i)
+ lfd f20,`-12*8`($i)
+ lfd f21,`-11*8`($i)
+ lfd f22,`-10*8`($i)
+ lfd f23,`-9*8`($i)
+ lfd f24,`-8*8`($i)
+ lfd f25,`-7*8`($i)
+ lfd f26,`-6*8`($i)
+ lfd f27,`-5*8`($i)
+ lfd f28,`-4*8`($i)
+ lfd f29,`-3*8`($i)
+ lfd f30,`-2*8`($i)
+ lfd f31,`-1*8`($i)
+ mr $sp,$i
blr
.long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+ .byte 0,12,4,0,0x8c,10,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/s390x-gf2m.pl b/main/openssl/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 00000000..9d18d40e
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, hence
+# the effort. And indeed, the module delivers a 55%-90%(*) improvement
+# on the heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on z196.
+# This is for the 64-bit build. In the 32-bit "highgprs" case the
+# improvement is even higher: on z990, for example, it was measured
+# at 80%-150%. ECDSA sign is a modest 9%-12% faster. Keep in mind that
+# these coefficients are not those of bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+#
+# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
+# so that improvement coefficients can vary from one specific
+# setup to another.
+
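
For reference while reading this module (and the x86 counterparts later in the patch): bn_GF2m_mul_2x2 builds a 128x128-bit carry-less product from three 64x64-bit halves combined Karatsuba-style; over GF(2), subtraction degenerates to XOR. A bit-serial C sketch of the same semantics, assuming 64-bit limbs; the assembly instead walks b in 4-bit windows over the 16-entry table it spills to the stack, but the results agree:

    #include <stdint.h>

    /* carry-less 64x64 -> 128-bit multiply, bit-serial reference */
    static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;   /* bits pushed past 2^64 */
            }
        *hi = h; *lo = l;
    }

    /* r[3..0] = (a1*x^64 + a0) * (b1*x^64 + b0) over GF(2) */
    static void gf2m_mul_2x2_ref(uint64_t r[4], uint64_t a1, uint64_t a0,
                                 uint64_t b1, uint64_t b0)
    {
        uint64_t h1, h0, l1, l0, m1, m0;
        gf2m_mul_1x1(&h1, &h0, a1, b1);           /* a1*b1 */
        gf2m_mul_1x1(&l1, &l0, a0, b0);           /* a0*b0 */
        gf2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1); /* (a0+a1)*(b0+b1) */
        m0 ^= l0 ^ h0;                            /* middle term */
        m1 ^= l1 ^ h1;
        r[0] = l0;
        r[1] = l1 ^ m0;
        r[2] = h0 ^ m1;
        r[3] = h1;
    }
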
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@function
+.align 16
+_mul_1x1:
+ lgr $a1,$a
+ sllg $a2,$a,1
+ sllg $a4,$a,2
+ sllg $a8,$a,3
+
+ srag $lo,$a1,63 # broadcast 63rd bit
+ nihh $a1,0x1fff
+ srag @i[0],$a2,63 # broadcast 62nd bit
+ nihh $a2,0x3fff
+ srag @i[1],$a4,63 # broadcast 61st bit
+ nihh $a4,0x7fff
+ ngr $lo,$b
+ ngr @i[0],$b
+ ngr @i[1],$b
+
+ lghi @T[0],0
+ lgr $a12,$a1
+ stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
+ xgr $a12,$a2
+ stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
+ lgr $a48,$a4
+ stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
+ xgr $a48,$a8
+ stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
+ xgr $a1,$a4
+
+ stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
+ xgr $a2,$a4
+ stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
+ xgr $a12,$a4
+ stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
+ xgr $a1,$a48
+ stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
+ xgr $a2,$a48
+
+ stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
+ xgr $a12,$a48
+ stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
+ xgr $a1,$a4
+ stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
+ xgr $a2,$a4
+ stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
+
+ xgr $a12,$a4
+ stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
+ srlg $hi,$lo,1
+ stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
+ sllg $lo,$lo,63
+ stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
+ srlg @T[0],@i[0],2
+ stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
+
+ lghi $mask,`0xf<<3`
+ sllg $a1,@i[0],62
+ sllg @i[0],$b,3
+ srlg @T[1],@i[1],3
+ ngr @i[0],$mask
+ sllg $a2,@i[1],61
+ srlg @i[1],$b,4-3
+ xgr $hi,@T[0]
+ ngr @i[1],$mask
+ xgr $lo,$a1
+ xgr $hi,@T[1]
+ xgr $lo,$a2
+
+ xg $lo,$stdframe(@i[0],$sp)
+ srlg @i[0],$b,8-3
+ ngr @i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ srlg @i[1],$b,`($n+2)*4`-3
+ sllg @T[0],@T[1],`$n*4`
+ ngr @i[1],$mask
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ sllg @T[0],@T[1],`$n*4`
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+
+ lg @T[0],$stdframe(@i[0],$sp)
+ sllg @T[1],@T[0],`($n+1)*4`
+ srlg @T[0],@T[0],`64-($n+1)*4`
+ xgr $lo,@T[1]
+ xgr $hi,@T[0]
+
+ br $ra
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@function
+.align 16
+bn_GF2m_mul_2x2:
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
+
+ lghi %r1,-$stdframe-128
+ la %r0,0($sp)
+ la $sp,0(%r1,$sp) # alloca
+ st${g} %r0,0($sp) # back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+ bras $ra,_mul_1x1 # a1·b1
+ stmg $lo,$hi,16($rp)
+
+ lg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # a0·b0
+ stmg $lo,$hi,0($rp)
+
+ lg $a,`$stdframe+128+3*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+5*$SIZE_T`($sp)
+ xg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ xg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ lmg @r[0],@r[3],0($rp)
+
+ xgr $lo,$hi
+ xgr $hi,@r[1]
+ xgr $lo,@r[0]
+ xgr $hi,@r[2]
+ xgr $lo,@r[3]
+ xgr $hi,@r[3]
+ xgr $lo,$hi
+ stg $hi,16($rp)
+ stg $lo,8($rp)
+___
+} else {
+$code.=<<___;
+ sllg %r3,%r3,32
+ sllg %r5,%r5,32
+ or %r3,%r4
+ or %r5,%r6
+ bras $ra,_mul_1x1
+ rllg $lo,$lo,32
+ rllg $hi,$hi,32
+ stmg $lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+ lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+ br $ra
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/s390x-mont.pl b/main/openssl/crypto/bn/asm/s390x-mont.pl
index f61246f5..9fd64e81 100644
--- a/main/openssl/crypto/bn/asm/s390x-mont.pl
+++ b/main/openssl/crypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. Compatibility with
+# 32-bit BN_ULONG is achieved by swapping words after 64-bit loads;
+# follow the _dswap-s. On z990 it was measured to perform 2.6-2.2
+# times better than compiler-generated code, less for longer keys...
+
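
In C terms, the fix-up a _dswap models, under the big-endian assumption: a 64-bit lg over two adjacent 32-bit BN_ULONGs lands the less significant limb in the high half, and rotating by 32 restores the natural 64-bit value, which is exactly what the rllg it expands to does. A small sketch, with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* big-endian load of {w[j], w[j+1]}: w[j], the less significant limb,
     * ends up in the high half, so rotate by 32 (rllg reg,reg,32). */
    static uint64_t load_limb_pair_be(const uint32_t *w)
    {
        uint64_t v;
        memcpy(&v, w, sizeof(v));
        return (v << 32) | (v >> 32);   /* rotate by 32 */
    }
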
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
$mn0="%r0";
$num="%r1";
@@ -60,34 +87,44 @@ $code.=<<___;
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
- lgf $num,164($sp) # pull $num
- sla $num,3 # $num to enumerate bytes
+ lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
+ sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
- stg %r2,16($sp)
+ st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ tmll $num,4
+ bnzr %r14 # if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
+___
+$code.=<<___;
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
- stmg %r3,%r15,24($sp)
-
- lghi $rp,-160-8 # leave room for carry bit
+ lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
- stg %r0,0($sp) # back chain
+ st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
+ _dswap $n0
lg $bi,0($bp)
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
@@ -95,6 +132,7 @@ bn_mul_mont:
msgr $mn0,$n0
lg $nlo,0($np) #
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -106,12 +144,14 @@ bn_mul_mont:
.align 16
.L1st:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -119,22 +159,24 @@ bn_mul_mont:
algr $nlo,$alo
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
- alg $alo,160($sp) # +=tp[0]
+ alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
@@ -142,6 +184,7 @@ bn_mul_mont:
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -153,14 +196,16 @@ bn_mul_mont:
.align 16
.Linner:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
- alg $alo,160($j,$sp)# +=tp[j]
+ alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -168,31 +213,33 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
- alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+ alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
- clg $bp,160+8+32($j,$sp) # compare to &bp[num]
+ cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
- lg $rp,160+8+16($j,$sp) # reincarnate rp
- la $ap,160($sp)
+ l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
+ la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
- slbg $alo,0($j,$np)
+ lg $nlo,0($j,$np)
+ _dswap $nlo
+ slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
@@ -207,19 +254,24 @@ bn_mul_mont:
la $j,0(%r0)
lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
- stg $j,160($j,$sp) # zap tp
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ _dswap $alo
+ stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
- la %r1,160+8+48($j,$sp)
- lmg %r6,%r15,0(%r1)
+ la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+ lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+ print $_,"\n";
+}
close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/x86-gf2m.S b/main/openssl/crypto/bn/asm/x86-gf2m.S
new file mode 100644
index 00000000..9403a5aa
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-gf2m.S
@@ -0,0 +1,335 @@
+.file "crypto/bn/asm/x86-gf2m.s"
+.text
+.type _mul_1x1_mmx,@function
+.align 16
+_mul_1x1_mmx:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ andl $1073741823,%ecx
+ leal (%edx,%edx,1),%ebp
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movd %eax,%mm2
+ movd %ebx,%mm3
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ pxor %mm5,%mm5
+ pxor %mm4,%mm4
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ pcmpgtd %mm2,%mm5
+ paddd %mm2,%mm2
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ pand %mm3,%mm5
+ pcmpgtd %mm2,%mm4
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ psllq $31,%mm5
+ pand %mm3,%mm4
+ movl %edx,24(%esp)
+ movl $7,%esi
+ movl %ebp,28(%esp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl %ebp,%edi
+ psllq $30,%mm4
+ andl %ebx,%edi
+ shrl $3,%ebx
+ movd (%esp,%esi,4),%mm0
+ movl %ebp,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $3,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $6,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $9,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $12,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $15,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $18,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $21,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $24,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ pxor %mm4,%mm0
+ psllq $27,%mm2
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ pxor %mm5,%mm0
+ psllq $30,%mm1
+ addl $36,%esp
+ pxor %mm1,%mm0
+ ret
+.size _mul_1x1_mmx,.-_mul_1x1_mmx
+.type _mul_1x1_ialu,@function
+.align 16
+_mul_1x1_ialu:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ leal (,%eax,4),%ebp
+ andl $1073741823,%ecx
+ leal (%eax,%eax,1),%edi
+ sarl $31,%eax
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ sarl $31,%edi
+ andl %ebx,%eax
+ movl %edx,24(%esp)
+ andl %ebx,%edi
+ movl %ebp,28(%esp)
+ movl %eax,%edx
+ shll $31,%eax
+ movl %edi,%ecx
+ shrl $1,%edx
+ movl $7,%esi
+ shll $30,%edi
+ andl %ebx,%esi
+ shrl $2,%ecx
+ xorl %edi,%eax
+ shrl $3,%ebx
+ movl $7,%edi
+ andl %ebx,%edi
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ xorl (%esp,%esi,4),%eax
+ movl $7,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $3,%ebp
+ andl %ebx,%edi
+ shrl $29,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $6,%ecx
+ andl %ebx,%esi
+ shrl $26,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $9,%ebp
+ andl %ebx,%edi
+ shrl $23,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $12,%ecx
+ andl %ebx,%esi
+ shrl $20,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $15,%ebp
+ andl %ebx,%edi
+ shrl $17,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $18,%ecx
+ andl %ebx,%esi
+ shrl $14,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $21,%ebp
+ andl %ebx,%edi
+ shrl $11,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $24,%ecx
+ andl %ebx,%esi
+ shrl $8,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl %ebp,%ecx
+ shll $27,%ebp
+ movl (%esp,%esi,4),%edi
+ shrl $5,%ecx
+ movl %edi,%esi
+ xorl %ebp,%eax
+ shll $30,%edi
+ xorl %ecx,%edx
+ shrl $2,%esi
+ xorl %edi,%eax
+ xorl %esi,%edx
+ addl $36,%esp
+ ret
+.size _mul_1x1_ialu,.-_mul_1x1_ialu
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+.L_bn_GF2m_mul_2x2_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl OPENSSL_ia32cap_P@GOT(%edx),%edx
+ movl (%edx),%eax
+ movl 4(%edx),%edx
+ testl $8388608,%eax
+ jz .L001ialu
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm7
+ movl 28(%esp),%eax
+ movl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm6
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ xorl 28(%esp),%eax
+ xorl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ pxor %mm7,%mm0
+ movl 20(%esp),%eax
+ pxor %mm6,%mm0
+ movq %mm0,%mm2
+ psllq $32,%mm0
+ popl %edi
+ psrlq $32,%mm2
+ popl %esi
+ pxor %mm6,%mm0
+ popl %ebx
+ pxor %mm7,%mm2
+ movq %mm0,(%eax)
+ popl %ebp
+ movq %mm2,8(%eax)
+ emms
+ ret
+.align 16
+.L001ialu:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $20,%esp
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,8(%esp)
+ movl %edx,12(%esp)
+ movl 48(%esp),%eax
+ movl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,(%esp)
+ movl %edx,4(%esp)
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ xorl 48(%esp),%eax
+ xorl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl 40(%esp),%ebp
+ movl (%esp),%ebx
+ movl 4(%esp),%ecx
+ movl 8(%esp),%edi
+ movl 12(%esp),%esi
+ xorl %edx,%eax
+ xorl %ecx,%edx
+ xorl %ebx,%eax
+ movl %ebx,(%ebp)
+ xorl %edi,%edx
+ movl %esi,12(%ebp)
+ xorl %esi,%eax
+ addl $20,%esp
+ xorl %esi,%edx
+ popl %edi
+ xorl %edx,%eax
+ popl %esi
+ movl %edx,8(%ebp)
+ popl %ebx
+ movl %eax,4(%ebp)
+ popl %ebp
+ ret
+.size bn_GF2m_mul_2x2,.-.L_bn_GF2m_mul_2x2_begin
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
+.byte 99,97,116,105,111,110,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.comm OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/bn/asm/x86-gf2m.pl b/main/openssl/crypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 00000000..b5795302
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII 16%-30%
+# P4 12%-12%
+# Opteron 18%-40%
+# Core2 19%-44%
+# Atom 38%-64%
+# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that the improvement coefficients above are not coefficients for
+# bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the
+# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
+# benchmark is more and more dominated by other subroutines, most
+# notably by BN_GF2m_mod[_mul]_arr...
+
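
The PCLMULQDQ path in the table above reduces the whole 64x64 carry-less product to a single instruction. For comparison, the same operation via compiler intrinsics; an illustration only (x86-64 intrinsics, compile with -mpclmul), not code from this patch:

    #include <stdint.h>
    #include <wmmintrin.h>   /* PCLMULQDQ intrinsics */

    /* carry-less 64x64 -> 128-bit multiply in one instruction */
    static void clmul_1x1(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
    {
        __m128i va = _mm_cvtsi64_si128((long long)a);
        __m128i vb = _mm_cvtsi64_si128((long long)b);
        __m128i r  = _mm_clmulepi64_si128(va, vb, 0x00); /* low qword of each */
        *lo = (uint64_t)_mm_cvtsi128_si64(r);
        *hi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(r, 8));
    }
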
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+ if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &and ($a1,0x3fffffff);
+ &lea ($a4,&DWP(0,$a2,$a2));
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &movd ($A,$a);
+ &movd ($B,$b);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &pxor ($B31,$B31);
+ &pxor ($B30,$B30);
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &pcmpgtd($B31,$A); # broadcast 31st bit
+ &paddd ($A,$A); # $A<<=1
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &pand ($B31,$B);
+ &pcmpgtd($B30,$A); # broadcast 30th bit
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+ &psllq ($B31,31);
+ &pand ($B30,$B);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &mov (@i[0],0x7);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($a4,@i[0]);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ &mov (@i[1],$a4);
+ &psllq ($B30,30);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &movd ($R,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],$a4);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],$a4);
+ &psllq (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &pxor ($R,@T[1]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &pxor ($R,$B30);
+ &psllq (@T[1],3*$n++);
+ &pxor ($R,@T[1]);
+
+ &movd (@T[0],&DWP(0,"esp",@i[0],4));
+ &pxor ($R,$B31);
+ &psllq (@T[0],3*$n);
+ &add ("esp",32+4);
+ &pxor ($R,@T[0]);
+ &ret ();
+&function_end_B("_mul_1x1_mmx");
+ }
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &lea ($a4,&DWP(0,"",$a,4));
+ &and ($a1,0x3fffffff);
+ &lea (@i[1],&DWP(0,$lo,$lo));
+ &sar ($lo,31); # broadcast 31st bit
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+	&sar	(@i[1],31);		# broadcast 30th bit
+ &and ($lo,$b);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &and (@i[1],$b);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($hi,$lo);
+ &shl ($lo,31);
+ &mov (@T[0],@i[1]);
+ &shr ($hi,1);
+
+ &mov (@i[0],0x7);
+ &shl (@i[1],30);
+ &and (@i[0],$b);
+ &shr (@T[0],2);
+ &xor ($lo,@i[1]);
+
+ &shr ($b,3);
+ &mov (@i[1],0x7); # 5-byte instruction!?
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+ &xor ($lo,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],0x7);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],0x7);
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr (@T[0],32-3*$n);
+ &xor ($lo,@T[1]);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &mov (@i[1],&DWP(0,"esp",@i[0],4));
+ &shr (@T[0],32-3*$n); $n++;
+ &mov (@i[0],@i[1]);
+ &xor ($lo,@T[1]);
+ &shl (@i[1],3*$n);
+ &xor ($hi,@T[0]);
+ &shr (@i[0],32-3*$n);
+ &xor ($lo,@i[1]);
+ &xor ($hi,@i[0]);
+
+ &add ("esp",32+4);
+ &ret ();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &mov ("eax",&DWP(0,"edx"));
+ &mov ("edx",&DWP(4,"edx"));
+ &test ("eax",1<<23); # check MMX bit
+ &jz (&label("ialu"));
+if ($sse2) {
+ &test ("eax",1<<24); # check FXSR bit
+ &jz (&label("mmx"));
+ &test ("edx",1<<1); # check PCLMULQDQ bit
+ &jz (&label("mmx"));
+
+ &movups ("xmm0",&QWP(8,"esp"));
+ &shufps ("xmm0","xmm0",0b10110001);
+ &pclmulqdq ("xmm0","xmm0",1);
+ &mov ("eax",&DWP(4,"esp"));
+ &movups (&QWP(0,"eax"),"xmm0");
+ &ret ();
+
+&set_label("mmx",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_mmx"); # a1·b1
+ &movq ("mm7",$R);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # a0·b0
+ &movq ("mm6",$R);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &pxor ($R,"mm7");
+ &mov ($a,&wparam(0));
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+ &movq ($A,$R);
+ &psllq ($R,32);
+ &pop ("edi");
+ &psrlq ($A,32);
+ &pop ("esi");
+ &pxor ($R,"mm6");
+ &pop ("ebx");
+ &pxor ($A,"mm7");
+ &movq (&QWP(0,$a),$R);
+ &pop ("ebp");
+ &movq (&QWP(8,$a),$A);
+ &emms ();
+ &ret ();
+&set_label("ialu",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &stack_push(4+1);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_ialu"); # a1·b1
+ &mov (&DWP(8,"esp"),$lo);
+ &mov (&DWP(12,"esp"),$hi);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # a0·b0
+ &mov (&DWP(0,"esp"),$lo);
+ &mov (&DWP(4,"esp"),$hi);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+
+ &mov ("ebp",&wparam(0));
+ @r=("ebx","ecx","edi","esi");
+ &mov (@r[0],&DWP(0,"esp"));
+ &mov (@r[1],&DWP(4,"esp"));
+ &mov (@r[2],&DWP(8,"esp"));
+ &mov (@r[3],&DWP(12,"esp"));
+
+ &xor ($lo,$hi);
+ &xor ($hi,@r[1]);
+ &xor ($lo,@r[0]);
+ &mov (&DWP(0,"ebp"),@r[0]);
+ &xor ($hi,@r[2]);
+ &mov (&DWP(12,"ebp"),@r[3]);
+ &xor ($lo,@r[3]);
+ &stack_pop(4+1);
+ &xor ($hi,@r[3]);
+ &pop ("edi");
+ &xor ($lo,$hi);
+ &pop ("esi");
+ &mov (&DWP(8,"ebp"),$hi);
+ &pop ("ebx");
+ &mov (&DWP(4,"ebp"),$lo);
+ &pop ("ebp");
+ &ret ();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/main/openssl/crypto/bn/asm/x86-mont.S b/main/openssl/crypto/bn/asm/x86-mont.S
new file mode 100644
index 00000000..2bbb0e3a
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-mont.S
@@ -0,0 +1,338 @@
+.file "crypto/bn/asm/x86-mont.s"
+.text
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ movl %esp,%ebp
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%esp
+ negl %edi
+ movl %esp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%esp
+ xorl %esp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%esp
+ andl $-64,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %edx,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %ebp,24(%esp)
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L001bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L002mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L002mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0032ndmadd
+.align 16
+.L0041stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0041stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0032ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0032ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0041stmadd
+.align 16
+.L001bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L006sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L006sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0073rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0073rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L008sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L009sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L009sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L008sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0073rdmadd
+.align 16
+.L005common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L010sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L010sub
+ sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
+.align 16
+.L011copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
+ decl %ebx
+ jge .L011copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
diff --git a/main/openssl/crypto/bn/asm/x86-mont.pl b/main/openssl/crypto/bn/asm/x86-mont.pl
index 5cd3cd2e..e8f6b050 100755
--- a/main/openssl/crypto/bn/asm/x86-mont.pl
+++ b/main/openssl/crypto/bn/asm/x86-mont.pl
@@ -527,8 +527,10 @@ $sbit=$num;
&jle (&label("sqradd"));
&mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
+ &add ("edx","edx");
&shr ($carry,31);
+ &add ("edx",$sbit);
+ &adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
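
What this hunk appears to fix: the removed lea computed word = sbit + 2*word modulo 2^32, and the earlier shr captured only the bit shifted out by the doubling, so a carry out of the final +sbit addition was lost. The add/shr/add/adc sequence keeps the full 33-bit result. The same step in C, as an illustration only:

    #include <stdint.h>

    /* double the high word, fold in the saved bit, keep the full carry */
    static void double_plus_sbit(uint32_t *word, uint32_t *carry, uint32_t sbit)
    {
        uint64_t t = 2 * (uint64_t)*word + sbit;  /* up to 33 bits wide */
        *word  = (uint32_t)t;
        *carry = (uint32_t)(t >> 32);  /* the lea-based code dropped this bit
                                          when adding sbit overflowed */
    }
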
diff --git a/main/openssl/crypto/bn/asm/x86_64-gcc.c b/main/openssl/crypto/bn/asm/x86_64-gcc.c
index acb0b401..329946e5 100644
--- a/main/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/main/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -66,7 +66,7 @@
#undef sqr
/*
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
diff --git a/main/openssl/crypto/bn/asm/x86_64-gf2m.S b/main/openssl/crypto/bn/asm/x86_64-gf2m.S
new file mode 100644
index 00000000..ccd2ed70
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-gf2m.S
@@ -0,0 +1,291 @@
+.text
+
+.type _mul_1x1,@function
+.align 16
+_mul_1x1:
+ subq $128+8,%rsp
+ movq $-1,%r9
+ leaq (%rax,%rax,1),%rsi
+ shrq $3,%r9
+ leaq (,%rax,4),%rdi
+ andq %rax,%r9
+ leaq (,%rax,8),%r12
+ sarq $63,%rax
+ leaq (%r9,%r9,1),%r10
+ sarq $63,%rsi
+ leaq (,%r9,4),%r11
+ andq %rbp,%rax
+ sarq $63,%rdi
+ movq %rax,%rdx
+ shlq $63,%rax
+ andq %rbp,%rsi
+ shrq $1,%rdx
+ movq %rsi,%rcx
+ shlq $62,%rsi
+ andq %rbp,%rdi
+ shrq $2,%rcx
+ xorq %rsi,%rax
+ movq %rdi,%rbx
+ shlq $61,%rdi
+ xorq %rcx,%rdx
+ shrq $3,%rbx
+ xorq %rdi,%rax
+ xorq %rbx,%rdx
+
+ movq %r9,%r13
+ movq $0,0(%rsp)
+ xorq %r10,%r13
+ movq %r9,8(%rsp)
+ movq %r11,%r14
+ movq %r10,16(%rsp)
+ xorq %r12,%r14
+ movq %r13,24(%rsp)
+
+ xorq %r11,%r9
+ movq %r11,32(%rsp)
+ xorq %r11,%r10
+ movq %r9,40(%rsp)
+ xorq %r11,%r13
+ movq %r10,48(%rsp)
+ xorq %r14,%r9
+ movq %r13,56(%rsp)
+ xorq %r14,%r10
+
+ movq %r12,64(%rsp)
+ xorq %r14,%r13
+ movq %r9,72(%rsp)
+ xorq %r11,%r9
+ movq %r10,80(%rsp)
+ xorq %r11,%r10
+ movq %r13,88(%rsp)
+
+ xorq %r11,%r13
+ movq %r14,96(%rsp)
+ movq %r8,%rsi
+ movq %r9,104(%rsp)
+ andq %rbp,%rsi
+ movq %r10,112(%rsp)
+ shrq $4,%rbp
+ movq %r13,120(%rsp)
+ movq %r8,%rdi
+ andq %rbp,%rdi
+ shrq $4,%rbp
+
+ movq (%rsp,%rsi,8),%xmm0
+ movq %r8,%rsi
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $4,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $60,%rbx
+ xorq %rcx,%rax
+ pslldq $1,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $12,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $52,%rbx
+ xorq %rcx,%rax
+ pslldq $2,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $20,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $44,%rbx
+ xorq %rcx,%rax
+ pslldq $3,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $28,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $36,%rbx
+ xorq %rcx,%rax
+ pslldq $4,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $36,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $28,%rbx
+ xorq %rcx,%rax
+ pslldq $5,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $44,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $20,%rbx
+ xorq %rcx,%rax
+ pslldq $6,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $52,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $12,%rbx
+ xorq %rcx,%rax
+ pslldq $7,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %rcx,%rbx
+ shlq $60,%rcx
+.byte 102,72,15,126,198
+ shrq $4,%rbx
+ xorq %rcx,%rax
+ psrldq $8,%xmm0
+ xorq %rbx,%rdx
+.byte 102,72,15,126,199
+ xorq %rsi,%rax
+ xorq %rdi,%rdx
+
+ addq $128+8,%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+ movq OPENSSL_ia32cap_P(%rip),%rax
+ btq $33,%rax
+ jnc .Lvanilla_mul_2x2
+
+.byte 102,72,15,110,198
+.byte 102,72,15,110,201
+.byte 102,72,15,110,210
+.byte 102,73,15,110,216
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+.byte 102,15,58,68,193,0
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+.byte 102,15,58,68,211,0
+.byte 102,15,58,68,229,0
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4
+ movdqa %xmm4,%xmm5
+ pslldq $8,%xmm4
+ psrldq $8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+
+.align 16
+.Lvanilla_mul_2x2:
+ leaq -136(%rsp),%rsp
+ movq %r14,80(%rsp)
+ movq %r13,88(%rsp)
+ movq %r12,96(%rsp)
+ movq %rbp,104(%rsp)
+ movq %rbx,112(%rsp)
+.Lbody_mul_2x2:
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+ movq %rdx,48(%rsp)
+ movq %rcx,56(%rsp)
+ movq %r8,64(%rsp)
+
+ movq $15,%r8
+ movq %rsi,%rax
+ movq %rcx,%rbp
+ call _mul_1x1
+ movq %rax,16(%rsp)
+ movq %rdx,24(%rsp)
+
+ movq 48(%rsp),%rax
+ movq 64(%rsp),%rbp
+ call _mul_1x1
+ movq %rax,0(%rsp)
+ movq %rdx,8(%rsp)
+
+ movq 40(%rsp),%rax
+ movq 56(%rsp),%rbp
+ xorq 48(%rsp),%rax
+ xorq 64(%rsp),%rbp
+ call _mul_1x1
+ movq 0(%rsp),%rbx
+ movq 8(%rsp),%rcx
+ movq 16(%rsp),%rdi
+ movq 24(%rsp),%rsi
+ movq 32(%rsp),%rbp
+
+ xorq %rdx,%rax
+ xorq %rcx,%rdx
+ xorq %rbx,%rax
+ movq %rbx,0(%rbp)
+ xorq %rdi,%rdx
+ movq %rsi,24(%rbp)
+ xorq %rsi,%rax
+ xorq %rsi,%rdx
+ xorq %rdx,%rax
+ movq %rdx,16(%rbp)
+ movq %rax,8(%rbp)
+
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbp
+ movq 112(%rsp),%rbx
+ leaq 136(%rsp),%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
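The tail of bn_GF2m_mul_2x2 above is one level of Karatsuba over GF(2)[x]: three 64x64 carry-less products (a1·b1, a0·b0 and (a0+a1)·(b0+b1), where addition is XOR) are recombined into the 256-bit result purely with XORs, since subtraction in GF(2) is XOR as well, so three multiplies do the work of four. A portable C sketch of the routine (the bit-serial `gf2m_mul_1x1` below is only a stand-in for the table-driven `_mul_1x1` assembly):

    #include <stdint.h>

    /* Schoolbook 64x64 carry-less multiply; stand-in for _mul_1x1. */
    static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo,
                             uint64_t a, uint64_t b)
    {
        uint64_t h = 0, l = 0;
        int i;
        for (i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;
            }
        *hi = h; *lo = l;
    }

    /* Karatsuba 2x2 over GF(2)[x]; mirrors the XOR combination above. */
    static void gf2m_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
                             uint64_t b1, uint64_t b0)
    {
        uint64_t p0, p1, q0, q1, m0, m1;

        gf2m_mul_1x1(&p1, &p0, a0, b0);            /* a0*b0 */
        gf2m_mul_1x1(&q1, &q0, a1, b1);            /* a1*b1 */
        gf2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1);  /* (a0+a1)*(b0+b1) */

        r[0] = p0;                                 /* low 128 bits      */
        r[1] = p1 ^ m0 ^ p0 ^ q0;                  /* middle, low word  */
        r[2] = q0 ^ m1 ^ p1 ^ q1;                  /* middle, high word */
        r[3] = q1;                                 /* high 128 bits     */
    }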
diff --git a/main/openssl/crypto/bn/asm/x86_64-gf2m.pl b/main/openssl/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 00000000..42bbec2f
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,390 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: one suitable
+# for any x86_64 CPU and a PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# The vanilla code path is at most 20% faster than compiler-generated
+# code [not very impressive], while the PCLMULQDQ one is a whole
+# 85%-160% better on 163- and 571-bit ECDH benchmarks on Intel CPUs.
+# Keep in mind that these coefficients are not the ones for
+# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
+
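On the PCLMULQDQ path each of the three Karatsuba products costs a single instruction. A hedged intrinsics equivalent of one 1x1 step (illustrative only; compile with -mpclmul, and note that the immediate of `_mm_clmulepi64_si128` selects which 64-bit halves are multiplied):

    #include <stdint.h>
    #include <wmmintrin.h>   /* PCLMULQDQ intrinsics */

    /* One 64x64 -> 128-bit carry-less multiply, as used three times by
     * the fast path of bn_GF2m_mul_2x2 below. */
    static __m128i clmul_1x1(uint64_t a, uint64_t b)
    {
        __m128i va = _mm_cvtsi64_si128((long long)a);
        __m128i vb = _mm_cvtsi64_si128((long long)b);
        return _mm_clmulepi64_si128(va, vb, 0x00);  /* low halves */
    }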
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($lo,$hi)=("%rax","%rdx"); $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+ sub \$128+8,%rsp
+ mov \$-1,$a1
+ lea ($a,$a),$i0
+ shr \$3,$a1
+ lea (,$a,4),$i1
+ and $a,$a1 # a1=a&0x1fffffffffffffff
+ lea (,$a,8),$a8
+ sar \$63,$a # broadcast 63rd bit
+ lea ($a1,$a1),$a2
+ sar \$63,$i0 # broadcast 62nd bit
+ lea (,$a1,4),$a4
+ and $b,$a
+ sar \$63,$i1 # broadcast 61st bit
+ mov $a,$hi # $a is $lo
+ shl \$63,$lo
+ and $b,$i0
+ shr \$1,$hi
+ mov $i0,$t1
+ shl \$62,$i0
+ and $b,$i1
+ shr \$2,$t1
+ xor $i0,$lo
+ mov $i1,$t0
+ shl \$61,$i1
+ xor $t1,$hi
+ shr \$3,$t0
+ xor $i1,$lo
+ xor $t0,$hi
+
+ mov $a1,$a12
+ movq \$0,0(%rsp) # tab[0]=0
+ xor $a2,$a12 # a1^a2
+ mov $a1,8(%rsp) # tab[1]=a1
+ mov $a4,$a48
+ mov $a2,16(%rsp) # tab[2]=a2
+ xor $a8,$a48 # a4^a8
+ mov $a12,24(%rsp) # tab[3]=a1^a2
+
+ xor $a4,$a1
+ mov $a4,32(%rsp) # tab[4]=a4
+ xor $a4,$a2
+ mov $a1,40(%rsp) # tab[5]=a1^a4
+ xor $a4,$a12
+ mov $a2,48(%rsp) # tab[6]=a2^a4
+ xor $a48,$a1 # a1^a4^a4^a8=a1^a8
+ mov $a12,56(%rsp) # tab[7]=a1^a2^a4
+ xor $a48,$a2 # a2^a4^a4^a8=a2^a8
+
+ mov $a8,64(%rsp) # tab[8]=a8
+ xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
+ mov $a1,72(%rsp) # tab[9]=a1^a8
+ xor $a4,$a1 # a1^a8^a4
+ mov $a2,80(%rsp) # tab[10]=a2^a8
+ xor $a4,$a2 # a2^a8^a4
+ mov $a12,88(%rsp) # tab[11]=a1^a2^a8
+
+ xor $a4,$a12 # a1^a2^a8^a4
+ mov $a48,96(%rsp) # tab[12]=a4^a8
+ mov $mask,$i0
+ mov $a1,104(%rsp) # tab[13]=a1^a4^a8
+ and $b,$i0
+ mov $a2,112(%rsp) # tab[14]=a2^a4^a8
+ shr \$4,$b
+ mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
+ mov $mask,$i1
+ and $b,$i1
+ shr \$4,$b
+
+ movq (%rsp,$i0,8),$R # half of the calculations are done in SSE2
+ mov $mask,$i0
+ and $b,$i0
+ shr \$4,$b
+___
+ for ($n=1;$n<8;$n++) {
+ $code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $mask,$i1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ and $b,$i1
+ movq (%rsp,$i0,8),$Tx
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ pslldq \$$n,$Tx
+ mov $mask,$i0
+ shr \$4,$b
+ xor $t0,$hi
+ and $b,$i0
+ shr \$4,$b
+ pxor $Tx,$R
+___
+ }
+$code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ movq $R,$i0
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ psrldq \$8,$R
+ xor $t0,$hi
+ movq $R,$i1
+ xor $i0,$lo
+ xor $i1,$hi
+
+ add \$128+8,%rsp
+ ret
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+___
+
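The `tab[]` comments above describe the vanilla path's windowed multiply: sixteen precomputed GF(2) multiples of the low 61 bits of `a` (so no table entry overflows 64 bits), indexed by successive 4-bit nibbles of `b`, with the top three bits of `a` folded in afterwards via the broadcast masks built by the `sar` instructions. The same scheme in C, as a sketch under those assumptions (this is not OpenSSL's actual C fallback):

    #include <stdint.h>

    static void gf2m_mul_1x1_win4(uint64_t *hi, uint64_t *lo,
                                  uint64_t a, uint64_t b)
    {
        uint64_t tab[16], h = 0, l = 0;
        uint64_t a1 = a & 0x1fffffffffffffffULL;  /* a1=a&0x1fff... */
        int i;

        tab[0] = 0;                    /* tab[i] = a1 * i over GF(2) */
        tab[1] = a1;
        for (i = 2; i < 16; i += 2) {
            tab[i]     = tab[i >> 1] << 1;
            tab[i + 1] = tab[i] ^ a1;
        }

        for (i = 0; i < 64; i += 4) {  /* one nibble of b per step */
            uint64_t t = tab[(b >> i) & 0xf];
            l ^= t << i;
            h ^= i ? t >> (64 - i) : 0;
        }

        /* top three bits of a, applied with broadcast masks like the
         * sar/and pairs in the assembly */
        for (i = 61; i < 64; i++) {
            uint64_t m = 0 - ((a >> i) & 1);
            l ^= (b << i) & m;
            h ^= (b >> (64 - i)) & m;
        }

        *hi = h; *lo = l;
    }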
+($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+ mov OPENSSL_ia32cap_P(%rip),%rax
+ bt \$33,%rax
+ jnc .Lvanilla_mul_2x2
+
+ movq $a1,%xmm0
+ movq $b1,%xmm1
+ movq $a0,%xmm2
+___
+$code.=<<___ if ($win64);
+ movq 40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+ movq $b0,%xmm3
+___
+$code.=<<___;
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ movdqa %xmm4,%xmm5
+ pslldq \$8,%xmm4
+ psrldq \$8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0($rp)
+ movdqu %xmm0,16($rp)
+ ret
+
+.align 16
+.Lvanilla_mul_2x2:
+ lea -8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ mov `8*17+40`(%rsp),$b0
+ mov %rdi,8*15(%rsp)
+ mov %rsi,8*16(%rsp)
+___
+$code.=<<___;
+ mov %r14,8*10(%rsp)
+ mov %r13,8*11(%rsp)
+ mov %r12,8*12(%rsp)
+ mov %rbp,8*13(%rsp)
+ mov %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+ mov $rp,32(%rsp) # save the arguments
+ mov $a1,40(%rsp)
+ mov $a0,48(%rsp)
+ mov $b1,56(%rsp)
+ mov $b0,64(%rsp)
+
+ mov \$0xf,$mask
+ mov $a1,$a
+ mov $b1,$b
+ call _mul_1x1 # a1·b1
+ mov $lo,16(%rsp)
+ mov $hi,24(%rsp)
+
+ mov 48(%rsp),$a
+ mov 64(%rsp),$b
+ call _mul_1x1 # a0·b0
+ mov $lo,0(%rsp)
+ mov $hi,8(%rsp)
+
+ mov 40(%rsp),$a
+ mov 56(%rsp),$b
+ xor 48(%rsp),$a
+ xor 64(%rsp),$b
+ call _mul_1x1 # (a0+a1)·(b0+b1)
+___
+ @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+ mov 0(%rsp),@r[0]
+ mov 8(%rsp),@r[1]
+ mov 16(%rsp),@r[2]
+ mov 24(%rsp),@r[3]
+ mov 32(%rsp),%rbp
+
+ xor $hi,$lo
+ xor @r[1],$hi
+ xor @r[0],$lo
+ mov @r[0],0(%rbp)
+ xor @r[2],$hi
+ mov @r[3],24(%rbp)
+ xor @r[3],$lo
+ xor @r[3],$hi
+ xor $hi,$lo
+ mov $hi,16(%rbp)
+ mov $lo,8(%rbp)
+
+ mov 8*10(%rsp),%r14
+ mov 8*11(%rsp),%r13
+ mov 8*12(%rsp),%r12
+ mov 8*13(%rsp),%rbp
+ mov 8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+ mov 8*15(%rsp),%rdi
+ mov 8*16(%rsp),%rsi
+___
+$code.=<<___;
+ lea 8*17(%rsp),%rsp
+ ret
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody_mul_2x2(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_prologue
+
+ mov 8*10(%rax),%r14 # mimic epilogue
+ mov 8*11(%rax),%r13
+ mov 8*12(%rax),%r12
+ mov 8*13(%rax),%rbp
+ mov 8*14(%rax),%rbx
+ mov 8*15(%rax),%rdi
+ mov 8*16(%rax),%rsi
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lin_prologue:
+ lea 8*17(%rax),%rax
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva _mul_1x1
+ .rva .Lend_mul_1x1
+ .rva .LSEH_info_1x1
+
+ .rva .Lvanilla_mul_2x2
+ .rva .Lend_mul_2x2
+ .rva .LSEH_info_2x2
+.section .xdata
+.align 8
+.LSEH_info_1x1:
+ .byte 0x01,0x07,0x02,0x00
+ .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
+.LSEH_info_2x2:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont.S b/main/openssl/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 00000000..95e29052
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont.pl b/main/openssl/crypto/bn/asm/x86_64-mont.pl
index 3b7a6f24..17fb94c8 100755
--- a/main/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/main/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
# respectable 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
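For orientation, what the `.L1st`/`.Linner` loops below compute is standard word-level Montgomery multiplication: for each word `bp[i]`, accumulate `ap[]*bp[i]` into the temporary, then add `m1*np[]` with `m1 = tp[0]*n0 mod 2^64` so the lowest word cancels and the whole vector shifts down one word. A reference sketch in C (names, the VLA and the 128-bit type are assumptions of this sketch; the final conditional subtraction is left as a comment):

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* gcc/clang extension */

    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num)
    {
        uint64_t t[num + 2];          /* tp[], as alloca'd in the asm */
        int i, j;

        for (j = 0; j < num + 2; j++) t[j] = 0;

        for (i = 0; i < num; i++) {
            u128 c = 0;
            for (j = 0; j < num; j++) {           /* t += ap[]*bp[i] */
                c += (u128)ap[j] * bp[i] + t[j];
                t[j] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num] = (uint64_t)c;
            t[num + 1] = (uint64_t)(c >> 64);

            uint64_t m1 = t[0] * n0;              /* kills t[0] mod 2^64 */
            c = 0;
            for (j = 0; j < num; j++) {           /* t += m1*np[] */
                c += (u128)m1 * np[j] + t[j];
                t[j] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num] = (uint64_t)c;
            t[num + 1] += (uint64_t)(c >> 64);

            for (j = 0; j <= num; j++)            /* shift down one word */
                t[j] = t[j + 1];
            t[num + 1] = 0;
        }
        for (j = 0; j < num; j++) rp[j] = t[j];
        /* one conditional subtraction of np[] may still be needed;
         * see the .Lsub/.Lcopy tail */
    }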
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -26,7 +40,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
@@ -37,7 +52,6 @@ $n0="%r8"; # const BN_ULONG *n0,
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
-$bp="%r12"; # reassign $bp
$hi1="%r13";
$i="%r14";
$j="%r15";
@@ -51,6 +65,16 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
push %rbx
push %rbp
push %r12
@@ -66,48 +90,66 @@ bn_mul_mont:
and \$-1024,%rsp # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lprologue:
- mov %rdx,$bp # $bp reassigned, remember?
-
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
- mov ($bp),$m0 # m0=bp[0]
- mov ($ap),%rax
+ mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
- mov %rdx,$hi0
+ mov ($np),%rax
- imulq $n0,%rax # "tp[0]"*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
- mulq ($np) # np[0]*m1
- add $lo0,%rax # discarded
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[0]
- add $hi0,%rax
adc \$0,%rdx
- mov %rax,$lo0
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
mov ($np,$j,8),%rax
- mov %rdx,$hi0
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
- cmp $num,$j
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .L1st
+ mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
@@ -116,50 +158,64 @@ bn_mul_mont:
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xor $j,$j # j=0
-
mov ($bp,$i,8),$m0 # m0=bp[i]
- mov ($ap),%rax # ap[0]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
- add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
adc \$0,%rdx
- mov %rax,$lo0
- mov %rdx,$hi0
- imulq $n0,%rax # tp[0]*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
- mulq ($np,$j,8) # np[0]*m1
- add $lo0,%rax # discarded
- mov 8(%rsp),$lo0 # tp[1]
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[i]
- add $hi0,%rax
adc \$0,%rdx
- add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
- adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
- cmp $num,$j
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .Linner
xor %rdx,%rdx
add $hi0,$hi1
@@ -173,35 +229,449 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
- lea (%rsp),$ap # borrow ap for tp
- lea -1($num),$j # j=num-1
-
- mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- jge .Lsub
+ dec $j # doesn't affect CF!
+ jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
- lea -1($num),$j
+ mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
mov ($ap,$j,8),%rax
- mov %rax,($rp,$j,8) # rp[i]=tp[i]
- mov $i,(%rsp,$j,8) # zap temporary vector
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
dec $j
- jge .Lcopy
+ jnz .Lcopy4x
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
@@ -211,9 +681,823 @@ bn_mul_mont:
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+ {{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 4 and
+ # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ xor %r10,%r10
+ mov %rsp,%r11 # put aside %rsp
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
+ and \$-1024,%rsp # minimize TLB usage
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved $rptr
+ # +40 saved $nptr
+ # +48 saved *n0
+ # +56 saved %rsp
+ # +64 t[2*$num]
+ #
+ mov $rptr,32(%rsp) # save $rptr
+ mov $nptr,40(%rsp)
+ mov $n0, 48(%rsp)
+ mov %r11, 56(%rsp) # save original %rsp
+.Lsqr4x_body:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
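In C terms, the two steps just described amount to the following sketch (illustrative only, not the generated code; `u128` is the gcc/clang 128-bit integer extension):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* t[] has 2*num words: a) cross products once, b) double the whole
     * vector by one bit and add the diagonal squares. */
    static void sqr_ref(uint64_t *t, const uint64_t *a, int num)
    {
        int i, j;
        u128 c;

        for (i = 0; i < 2 * num; i++) t[i] = 0;

        for (i = 0; i < num; i++) {          /* a) a[i]*a[j], i < j */
            c = 0;
            for (j = i + 1; j < num; j++) {
                c += (u128)a[i] * a[j] + t[i + j];
                t[i + j] = (uint64_t)c;
                c >>= 64;
            }
            t[i + num] = (uint64_t)c;
        }

        for (i = 2 * num - 1; i > 0; i--)    /* b) shift left by one bit */
            t[i] = (t[i] << 1) | (t[i - 1] >> 63);
        t[0] <<= 1;

        c = 0;                               /* ... and add a[i]*a[i] */
        for (i = 0; i < num; i++) {
            c += (u128)a[i] * a[i] + t[2 * i];
            t[2 * i] = (uint64_t)c;
            c >>= 64;
            c += t[2 * i + 1];
            t[2 * i + 1] = (uint64_t)c;
            c >>= 64;
        }
    }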
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[4]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[4]*a[1]
+ add %rax,$A1[0] # a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[0]
+ add %rax,$A0[0] # a[5]*a[0]+a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[5]
+
+ mov 16($aptr,$j),$ai # a[6]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[5]*a[1]
+ add %rax,$A1[1] # a[5]*a[1]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[6]*a[0]
+ add %rax,$A0[1] # a[6]*a[0]+a[5]*a[1]+t[6]
+ mov $ai,%rax # a[6]
+ adc %rdx,$A0[0]
+ mov $A0[1],16($tptr,$j) # t[6]
+
+
+ mov 24($aptr,$j),$ai # a[7]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[6]*a[1]
+ add %rax,$A1[0] # a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 32($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[7]*a[0]
+ add %rax,$A0[0] # a[7]*a[0]+a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[7]*a[1]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[8]
+ lea 16($i),$i
+ mov $A1[0],8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mov -24($tptr,$i),$A0[0] # t[1]
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+ xor $A1[0],$A1[0]
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[3]
+
+ lea 16($j),$j
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j # j++
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr) # t[1]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr) # t[2]
+
+ mov -8($aptr),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[4]
+ mov $A1[0],8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 64(%rsp,$num,2),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov -24($tptr,$i,2),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],-40($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr,$i,2)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
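+# In C terms (illustrative sketch only, carries elided, n0 = -n^-1 mod
+# 2^64, u128 = unsigned __int128):
+#
+#	for (i = 0; i < num; i++) {
+#		m = t[i] * n0;			/* mod 2^64 */
+#		for (j = 0; j < num; j++)
+#			t[i+j] += (u128)m * n[j];	/* zeroes t[i] */
+#	}
+#	/* result before final subtraction: t[num..2*num-1] plus $topbit */
+#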
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+ mov 40(%rsp),$nptr # restore $nptr
+ mov 48(%rsp),$n0 # restore *n0
+ xor $j,$j
+ mov $num,0(%rsp) # save $num
+ sub $num,$j # $j=-$num
+ mov 64(%rsp),$A0[0] # t[0] # modsched #
+ mov $n0,$m0 # # modsched #
+ lea 64(%rsp,$num,2),%rax # end of t[] buffer
+ lea 64(%rsp,$num),$tptr # end of t[] window
+ mov %rax,8(%rsp) # save end of t[] buffer
+ lea ($nptr,$num),$nptr # end of n[] buffer
+ xor $topbit,$topbit # $topbit=0
+
+ mov 0($nptr,$j),%rax # n[0] # modsched #
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
+ mov %rax,$Ni[0] # # modsched #
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xor $A0[1],$A0[1]
+ mul $m0 # n[0]*m0
+ add %rax,$A0[0] # n[0]*m0+t[0]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+ mov $n0,$m1
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[1]*m0
+ add %rax,$A0[1] # n[1]*m0+t[1]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+ imulq $A0[1],$m1
+
+ mov 16($nptr,$j),$Ni[0] # n[2]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[0]*m1
+ add %rax,$A1[0] # n[0]*m1+"t[1]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[1]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[2]*m0
+ add %rax,$A0[0] # n[2]*m0+t[2]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[3]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[1]*m1
+ add %rax,$A1[1] # n[1]*m1+"t[2]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[2]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[3]*m0
+ add %rax,$A0[1] # n[3]*m0+t[3]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ mov ($nptr,$j),$Ni[0] # n[4]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[2]*m1
+ add %rax,$A1[0] # n[2]*m1+"t[3]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr,$j) # "t[3]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[4]*m0
+ add %rax,$A0[0] # n[4]*m0+t[4]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 8($nptr,$j),$Ni[1] # n[5]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[3]*m1
+ add %rax,$A1[1] # n[3]*m1+"t[4]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr,$j) # "t[4]"
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[5]*m0
+ add %rax,$A0[1] # n[5]*m0+t[5]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+
+ mov 16($nptr,$j),$Ni[0] # n[6]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[4]*m1
+ add %rax,$A1[0] # n[4]*m1+"t[5]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[5]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[6]*m0
+ add %rax,$A0[0] # n[6]*m0+t[6]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[7]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[5]*m1
+ add %rax,$A1[1] # n[5]*m1+"t[6]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[6]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[7]*m0
+ add %rax,$A0[1] # n[7]*m0+t[7]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ cmp \$0,$j
+ jne .Lsqr4x_mont_inner
+
+ sub 0(%rsp),$j # $j=-$num # modsched #
+ mov $n0,$m0 # # modsched #
+
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[6]*m1
+ add %rax,$A1[0] # n[6]*m1+"t[7]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr) # "t[7]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr),$A0[0] # +t[8]
+ adc \$0,$A0[1]
+ mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
+ add $topbit,$A0[0]
+ adc \$0,$A0[1]
+
+ imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
+ xor $A1[0],$A1[0]
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ add $A0[0],$A1[1]
+ mov 16($tptr,$j),$A0[0] # t[0] # modsched #
+ adc \$0,$A1[0]
+ mul $m1 # n[7]*m1
+ add %rax,$A1[1] # n[7]*m1+"t[8]"
+ mov $Ni[0],%rax # # modsched #
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr) # "t[8]"
+
+ xor $topbit,$topbit
+ add 8($tptr),$A1[0] # +t[9]
+ adc $topbit,$topbit
+ add $A0[1],$A1[0]
+ lea 16($tptr),$tptr # "t[$num]>>128"
+ adc \$0,$topbit
+ mov $A1[0],-8($tptr) # "t[9]"
+ cmp 8(%rsp),$tptr # are we done?
+ jb .Lsqr4x_mont_outer
+
+ mov 0(%rsp),$num # restore $num
+ mov $topbit,($tptr) # save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
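+# In C terms (illustrative sketch; the select has to stay constant-time):
+#
+#	borrow = sub_words(rp, tp, np, num);	/* rp = tp - np */
+#	mask = 0 - (BN_ULONG)borrow;		/* all-ones iff tp < np */
+#	for (i = 0; i < num; i++) {
+#		rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
+#		tp[i] = 0;			/* zap temporary vector */
+#	}
+#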
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+ mov 64(%rsp,$num),@ri[0] # tp[0]
+ lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
+ mov 40(%rsp),$nptr # restore $nptr
+ shr \$5,$num # num/4
+ mov 8($tptr),@ri[1] # t[1]
+ xor $i,$i # i=0 and clear CF!
+
+ mov 32(%rsp),$rptr # restore $rptr
+ sub 0($nptr),@ri[0]
+ mov 16($tptr),@ri[2] # t[2]
+ mov 24($tptr),@ri[3] # t[3]
+ sbb 8($nptr),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($nptr,$i,8),@ri[2]
+ mov 32($tptr,$i,8),@ri[0] # tp[i+1]
+ mov 40($tptr,$i,8),@ri[1]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($nptr,$i,8),@ri[0]
+ mov 48($tptr,$i,8),@ri[2]
+ mov 56($tptr,$i,8),@ri[3]
+ sbb 40($nptr,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsqr4x_sub
+
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($tptr,$i,8),@ri[0] # load overflow bit
+ sbb 16($nptr,$i,8),@ri[2]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$tptr
+ not @ri[0]
+ mov $rptr,$nptr
+ and @ri[0],$nptr
+ lea -1($num),$j
+ or $nptr,$tptr # tp=borrow?tp:rp
+
+ pxor %xmm0,%xmm0
+ lea 64(%rsp,$num,8),$nptr
+ movdqu ($tptr),%xmm1
+ lea ($nptr,$num,8),$nptr
+ movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
+ movdqa %xmm0,($nptr) # zap upper half of temporary vector
+ movdqu %xmm1,($rptr)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy: # copy or in-place refresh
+ movdqu 16($tptr,$i),%xmm2
+ movdqu 32($tptr,$i),%xmm1
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+ movdqu %xmm1,32($rptr,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lsqr4x_copy
+
+ movdqu 16($tptr,$i),%xmm2
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+ mov 56(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ ret
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
@@ -228,9 +1512,9 @@ $disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
+.type mul_handler,\@abi-omnipotent
.align 16
-se_handler:
+mul_handler:
push %rsi
push %rdi
push %rbx
@@ -245,15 +1529,20 @@ se_handler:
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
@@ -272,7 +1561,53 @@ se_handler:
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
-.Lin_prologue:
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lsqr4x_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lsqr4x_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lsqr4x_epilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lsqr4x_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 56(%rax),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
@@ -310,7 +1645,7 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
@@ -318,11 +1653,27 @@ se_handler:
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr4x_mont
+ .rva .LSEH_end_bn_sqr4x_mont
+ .rva .LSEH_info_bn_sqr4x_mont
+
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
- .rva se_handler
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
___
}
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont5.S b/main/openssl/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 00000000..49ec6ac6
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[0])
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[1])
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[i+1])
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 4(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[0])
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[1])
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ xorq %r15,%r15
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-40(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[i+1])
+ movq %rdi,-16(%rsp,%r15,8)
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpq $0,%rsi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subq $1,%rsi
+ jnz .Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movq %rcx,%r11
+ shrq $3,%rcx
+ andq $7,%r11
+ notq %rcx
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%rcx
+ leaq 96(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -96(%rdx),%xmm0
+ movq -32(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subq $1,%rsi
+ jnz .Lgather
+	.byte	0xf3,0xc3	# rep ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont5.pl b/main/openssl/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 00000000..dae0fe24
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,1071 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to a powers table computed in BN_mod_exp_mont_consttime.
+# In addition, a subroutine that scatters elements of the powers table
+# is implemented, so that scatter/gather can be tuned without
+# modifying bn_exp.c.
+
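+# Conceptually each gathered word is assembled as (illustrative C
+# sketch, not part of this file):
+#
+#	w = 0;
+#	for (line = 0; line < 4; line++)	/* 4 cache lines per row */
+#		w |= row[line*8 + idx%8] & (line == idx/8 ? ~0UL : 0);
+#
+# so the same four cache lines are read for every index; only the
+# offset within a line depends on the secret, which a cache probe
+# cannot resolve.
+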
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+			# int idx);	# 0 to 2^5-1, "index" in $bp holding
+			#		  pre-computed powers of a', interlaced
+			#		  in such a manner that b[0] is $bp[idx],
+			#		  b[1] is $bp[2^5+idx], etc.
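+			#		  The scatter side in C terms is
+			#		  (illustrative sketch):
+			#		  for (w = 0; w < num; w++)
+			#			  tbl[32*w+idx] = b[w];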
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
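+	# ($STRIDE = 256 bytes: the 32 candidate words for one position
+	#  sit side by side and span $STRIDE/$N = 4 cache lines; the masks
+	#  loaded below select the right line without forming a
+	#  secret-dependent address)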
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 4($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ xor $j,$j # j=0
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j	# j++
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+ movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-96`($tbl),%xmm0
+ movq `1*$STRIDE/4-96`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
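+# (exactly one of the four 64-bit masks above is all-ones, selected by
+# idx/8, so the gather keeps the element from the one cache line that
+# holds it while still reading all four lines)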
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ lea `40+48`(%rax),%rax
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # end of alloca label
+ cmp %r10,%rbx # context->Rip<end of alloca label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ movaps (%rax),%xmm0
+ movaps 16(%rax),%xmm1
+ lea `40+48`(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/bn.h b/main/openssl/crypto/bn/bn.h
index a0bc4783..9281ce59 100644
--- a/main/openssl/crypto/bn/bn.h
+++ b/main/openssl/crypto/bn/bn.h
@@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+ const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ BIGNUM *Xp1, BIGNUM *Xp2,
+ const BIGNUM *Xp,
+ const BIGNUM *e, BN_CTX *ctx,
+ BN_GENCB *cb);
+
BN_MONT_CTX *BN_MONT_CTX_new(void );
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
+
/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
*
* The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@ int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
+#endif
+
/* faster mod functions for the 'NIST primes'
* 0 <= a < p^2 */
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
@@ -677,6 +692,10 @@ const BIGNUM *BN_get0_nist_prime_256(void);
const BIGNUM *BN_get0_nist_prime_384(void);
const BIGNUM *BN_get0_nist_prime_521(void);
+int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range, const BIGNUM *priv,
+ const unsigned char *message, size_t message_len,
+ BN_CTX *ctx);
+
/* library internal functions */
#define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
@@ -827,6 +846,7 @@ void ERR_load_BN_strings(void);
#define BN_F_BN_EXP 123
#define BN_F_BN_EXPAND2 108
#define BN_F_BN_EXPAND_INTERNAL 120
+#define BN_F_BN_GENERATE_DSA_NONCE 140
#define BN_F_BN_GF2M_MOD 131
#define BN_F_BN_GF2M_MOD_EXP 132
#define BN_F_BN_GF2M_MOD_MUL 133
@@ -866,6 +886,7 @@ void ERR_load_BN_strings(void);
#define BN_R_NOT_INITIALIZED 107
#define BN_R_NO_INVERSE 108
#define BN_R_NO_SOLUTION 116
+#define BN_R_PRIVATE_KEY_TOO_LARGE 117
#define BN_R_P_IS_NOT_PRIME 112
#define BN_R_TOO_MANY_ITERATIONS 113
#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
diff --git a/main/openssl/crypto/bn/bn_blind.c b/main/openssl/crypto/bn/bn_blind.c
index e060592f..9ed8bc2b 100644
--- a/main/openssl/crypto/bn/bn_blind.c
+++ b/main/openssl/crypto/bn/bn_blind.c
@@ -126,7 +126,7 @@ struct bn_blinding_st
* used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
#endif
CRYPTO_THREADID tid;
- unsigned int counter;
+ int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
@@ -160,7 +160,10 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
- ret->counter = BN_BLINDING_COUNTER;
+ /* Set the counter to the special value -1
+ * to indicate that this is never-used fresh blinding
+ * that does not need updating before first use. */
+ ret->counter = -1;
CRYPTO_THREADID_current(&ret->tid);
return(ret);
err:
@@ -190,7 +193,10 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
goto err;
}
- if (--(b->counter) == 0 && b->e != NULL &&
+ if (b->counter == -1)
+ b->counter = 0;
+
+ if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE))
{
/* re-create blinding parameters */
@@ -205,8 +211,8 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
ret=1;
err:
- if (b->counter == 0)
- b->counter = BN_BLINDING_COUNTER;
+ if (b->counter == BN_BLINDING_COUNTER)
+ b->counter = 0;
return(ret);
}
@@ -227,6 +233,12 @@ int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
return(0);
}
+ if (b->counter == -1)
+ /* Fresh blinding, doesn't need updating. */
+ b->counter = 0;
+ else if (!BN_BLINDING_update(b,ctx))
+ return(0);
+
if (r != NULL)
{
if (!BN_copy(r, b->Ai)) ret=0;
@@ -247,22 +259,19 @@ int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ct
int ret;
bn_check_top(n);
- if ((b->A == NULL) || (b->Ai == NULL))
- {
- BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
- return(0);
- }
if (r != NULL)
ret = BN_mod_mul(n, n, r, b->mod, ctx);
else
- ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
-
- if (ret >= 0)
{
- if (!BN_BLINDING_update(b,ctx))
+ if (b->Ai == NULL)
+ {
+ BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
+ }
+ ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
}
+
bn_check_top(n);
return(ret);
}
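The counter change above replaces the old count-down scheme with a count-up one plus a -1 sentinel for never-used blinding, so BN_BLINDING_convert_ex can skip the update on first use. A toy model of the resulting state machine, assuming BN_BLINDING_COUNTER is 32 as in OpenSSL:

#include <stdio.h>

#define BN_BLINDING_COUNTER 32

static int counter = -1;	/* fresh blinding */

static void convert(void)
	{
	if (counter == -1)
		{
		counter = 0;	/* consume the sentinel, skip the update */
		return;
		}
	if (++counter == BN_BLINDING_COUNTER)
		{
		printf("re-create blinding parameters\n");
		counter = 0;
		}
	}

int main(void)
	{
	int i;
	for (i = 0; i < 70; i++)
		convert();	/* prints on the 33rd and 65th use */
	return 0;
	}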
diff --git a/main/openssl/crypto/bn/bn_div.c b/main/openssl/crypto/bn/bn_div.c
index 802a43d6..7b240318 100644
--- a/main/openssl/crypto/bn/bn_div.c
+++ b/main/openssl/crypto/bn/bn_div.c
@@ -141,6 +141,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
*
* <appro@fy.chalmers.se>
*/
+#undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
@@ -155,6 +156,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
* Same story here, but it's 128-bit by 64-bit division. Wow!
* <appro@fy.chalmers.se>
*/
+# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
@@ -169,15 +171,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -186,6 +186,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG *resp,*wnump;
BN_ULONG d0,d1;
int num_n,div_n;
+ int no_branch=0;
/* Invalid zero-padding would have particularly bad consequences
* in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +201,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
{
- return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ no_branch=1;
}
bn_check_top(dv);
@@ -214,7 +215,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
return(0);
}
- if (BN_ucmp(num,divisor) < 0)
+ if (!no_branch && BN_ucmp(num,divisor) < 0)
{
if (rm != NULL)
{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +240,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
norm_shift+=BN_BITS2;
if (!(BN_lshift(snum,num,norm_shift))) goto err;
snum->neg=0;
- div_n=sdiv->top;
- num_n=snum->top;
- loop=num_n-div_n;
- /* Lets setup a 'window' into snum
- * This is the part that corresponds to the current
- * 'area' being divided */
- wnum.neg = 0;
- wnum.d = &(snum->d[loop]);
- wnum.top = div_n;
- /* only needed when BN_ucmp messes up the values between top and max */
- wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
-
- /* Get the top 2 words of sdiv */
- /* div_n=sdiv->top; */
- d0=sdiv->d[div_n-1];
- d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
- /* pointer to the 'top' of snum */
- wnump= &(snum->d[num_n-1]);
-
- /* Setup to 'res' */
- res->neg= (num->neg^divisor->neg);
- if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop;
- resp= &(res->d[loop-1]);
-
- /* space for temp */
- if (!bn_wexpand(tmp,(div_n+1))) goto err;
- if (BN_ucmp(&wnum,sdiv) >= 0)
+ if (no_branch)
{
- /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
- * bn_pollute) the const bignum arguments =>
- * clean the values between top and max again */
- bn_clear_top2max(&wnum);
- bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
- *resp=1;
- }
- else
- res->top--;
- /* if res->top == 0 then clear the neg value otherwise decrease
- * the resp pointer */
- if (res->top == 0)
- res->neg = 0;
- else
- resp--;
-
- for (i=0; i<loop-1; i++, wnump--, resp--)
- {
- BN_ULONG q,l0;
- /* the first part of the loop uses the top two words of
- * snum and sdiv to calculate a BN_ULONG q such that
- * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
- BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
- q=bn_div_3_words(wnump,d1,d0);
-#else
- BN_ULONG n0,n1,rem=0;
-
- n0=wnump[0];
- n1=wnump[-1];
- if (n0 == d0)
- q=BN_MASK2;
- else /* n0 < d0 */
- {
-#ifdef BN_LLONG
- BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
- q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /*
- * rem doesn't have to be BN_ULLONG. The least we
- * know it's less that d0, isn't it?
- */
- rem=(n1-q*d0)&BN_MASK2;
-#endif
- t2=(BN_ULLONG)d1*q;
-
- for (;;)
- {
- if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- t2 -= d1;
- }
-#else /* !BN_LLONG */
- BN_ULONG t2l,t2h;
-
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
- BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
- t2l = d1 * q;
- t2h = BN_UMULT_HIGH(d1,q);
-#else
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
{
- BN_ULONG ql, qh;
- t2l=LBITS(d1); t2h=HBITS(d1);
- ql =LBITS(q); qh =HBITS(q);
- mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
}
-#endif
-
- for (;;)
- {
- if ((t2h < rem) ||
- ((t2h == rem) && (t2l <= wnump[-2])))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- if (t2l < d1) t2h--; t2l -= d1;
- }
-#endif /* !BN_LLONG */
- }
-#endif /* !BN_DIV3W */
-
- l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
- tmp->d[div_n]=l0;
- wnum.d--;
- /* ingore top values of the bignums just sub the two
- * BN_ULONG arrays with bn_sub_words */
- if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ else
{
- /* Note: As we have considered only the leading
- * two BN_ULONGs in the calculation of q, sdiv * q
- * might be greater than wnum (but then (q-1) * sdiv
- * is less or equal than wnum)
- */
- q--;
- if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
- /* we can't have an overflow here (assuming
- * that q != 0, but if q == 0 then tmp is
- * zero anyway) */
- (*wnump)++;
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
}
- /* store part of the result */
- *resp = q;
- }
- bn_correct_top(snum);
- if (rm != NULL)
- {
- /* Keep a copy of the neg flag in num because if rm==num
- * BN_rshift() will overwrite it.
- */
- int neg = num->neg;
- BN_rshift(rm,snum,norm_shift);
- if (!BN_is_zero(rm))
- rm->neg = neg;
- bn_check_top(rm);
- }
- BN_CTX_end(ctx);
- return(1);
-err:
- bn_check_top(rm);
- BN_CTX_end(ctx);
- return(0);
- }
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx)
- {
- int norm_shift,i,loop;
- BIGNUM *tmp,wnum,*snum,*sdiv,*res;
- BN_ULONG *resp,*wnump;
- BN_ULONG d0,d1;
- int num_n,div_n;
-
- bn_check_top(dv);
- bn_check_top(rm);
- /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
- bn_check_top(divisor);
-
- if (BN_is_zero(divisor))
- {
- BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
- return(0);
- }
-
- BN_CTX_start(ctx);
- tmp=BN_CTX_get(ctx);
- snum=BN_CTX_get(ctx);
- sdiv=BN_CTX_get(ctx);
- if (dv == NULL)
- res=BN_CTX_get(ctx);
- else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
-
- /* First we normalise the numbers */
- norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
- if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
- sdiv->neg=0;
- norm_shift+=BN_BITS2;
- if (!(BN_lshift(snum,num,norm_shift))) goto err;
- snum->neg=0;
-
- /* Since we don't know whether snum is larger than sdiv,
- * we pad snum with enough zeroes without changing its
- * value.
- */
- if (snum->top <= sdiv->top+1)
- {
- if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
- for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
- snum->top = sdiv->top + 2;
- }
- else
- {
- if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
- snum->d[snum->top] = 0;
- snum->top ++;
}
div_n=sdiv->top;
@@ -500,12 +284,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
/* Setup to 'res' */
res->neg= (num->neg^divisor->neg);
if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop-1;
+ res->top=loop-no_branch;
resp= &(res->d[loop-1]);
/* space for temp */
if (!bn_wexpand(tmp,(div_n+1))) goto err;
+ if (!no_branch)
+ {
+ if (BN_ucmp(&wnum,sdiv) >= 0)
+ {
+ /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+ * bn_pollute) the const bignum arguments =>
+ * clean the values between top and max again */
+ bn_clear_top2max(&wnum);
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp=1;
+ }
+ else
+ res->top--;
+ }
+
/* if res->top == 0 then clear the neg value otherwise decrease
* the resp pointer */
if (res->top == 0)
@@ -638,7 +437,7 @@ X) -> 0x%08X\n",
rm->neg = neg;
bn_check_top(rm);
}
- bn_correct_top(res);
+ if (no_branch) bn_correct_top(res);
BN_CTX_end(ctx);
return(1);
err:
@@ -646,5 +445,4 @@ err:
BN_CTX_end(ctx);
return(0);
}
-
#endif
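The merged function keeps the contract stated in the comment: the quotient rounds toward zero and dv*divisor + rm reconstructs num, so the remainder takes num's sign. A tiny self-contained illustration, with plain ints standing in for BIGNUMs:

#include <assert.h>
#include <stdio.h>

int main(void)
	{
	int num = -7, divisor = 2;
	int dv = num / divisor;		/* C99 division also truncates toward zero */
	int rm = num - dv * divisor;

	assert(dv * divisor + rm == num);
	printf("dv=%d rm=%d\n", dv, rm);	/* dv=-3 rm=-1 */
	return 0;
	}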
diff --git a/main/openssl/crypto/bn/bn_err.c b/main/openssl/crypto/bn/bn_err.c
index cfe2eb94..f722b525 100644
--- a/main/openssl/crypto/bn/bn_err.c
+++ b/main/openssl/crypto/bn/bn_err.c
@@ -87,6 +87,7 @@ static ERR_STRING_DATA BN_str_functs[]=
{ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
{ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
+{ERR_FUNC(BN_F_BN_GENERATE_DSA_NONCE), "BN_generate_dsa_nonce"},
{ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
@@ -129,6 +130,7 @@ static ERR_STRING_DATA BN_str_reasons[]=
{ERR_REASON(BN_R_NOT_INITIALIZED) ,"not initialized"},
{ERR_REASON(BN_R_NO_INVERSE) ,"no inverse"},
{ERR_REASON(BN_R_NO_SOLUTION) ,"no solution"},
+{ERR_REASON(BN_R_PRIVATE_KEY_TOO_LARGE) ,"private key too large"},
{ERR_REASON(BN_R_P_IS_NOT_PRIME) ,"p is not prime"},
{ERR_REASON(BN_R_TOO_MANY_ITERATIONS) ,"too many iterations"},
{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),"too many temporary variables"},
diff --git a/main/openssl/crypto/bn/bn_exp.c b/main/openssl/crypto/bn/bn_exp.c
index d9b6c737..2abf6fd6 100644
--- a/main/openssl/crypto/bn/bn_exp.c
+++ b/main/openssl/crypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+# define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+# define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@@ -522,23 +534,17 @@ err:
* as cache lines are concerned. The following functions are used to transfer a BIGNUM
* from/to that table. */
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
{
size_t i, j;
- if (bn_wexpand(b, top) == NULL)
- return 0;
- while (b->top < top)
- {
- b->d[b->top++] = 0;
- }
-
+ if (top > b->top)
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
{
buf[j] = ((unsigned char*)b->d)[i];
}
- bn_correct_top(b);
return 1;
}
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
- ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
- int i,bits,ret=0,idx,window,wvalue;
+ int i,bits,ret=0,window,wvalue;
int top;
- BIGNUM *r;
- const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
- BIGNUM *computeTemp=NULL, *am=NULL;
+ BIGNUM tmp, am;
bn_check_top(a);
bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return ret;
}
- /* Initialize BIGNUM context and allocate intermediate result */
BN_CTX_start(ctx);
- r = BN_CTX_get(ctx);
- if (r == NULL) goto err;
/* Allocate a montgomery context if it was not supplied by the caller.
* If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+#endif
/* Allocate a buffer large enough to hold all of the pre-computed
- * powers of a.
+ * powers of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0])*top*numPowers;
+ powerbufLen = sizeof(m->d[0])*(top*numPowers +
+ ((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
- /* Initialize the intermediate result. Do this early to save double conversion,
- * once each for a^0 and intermediate result.
- */
- if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
- /* Initialize computeTemp as a^1 with montgomery precalcs */
- computeTemp = BN_CTX_get(ctx);
- am = BN_CTX_get(ctx);
- if (computeTemp==NULL || am==NULL) goto err;
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+#if 1
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+#else
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++)
+ tmp.d[i] = (~m->d[i])&BN_MASK2;
+ tmp.top = top;
+#endif
+ /* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a,m) >= 0)
{
- if (!BN_mod(am,a,m,ctx))
- goto err;
- aa= am;
+ if (!BN_mod(&am,a,m,ctx)) goto err;
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
}
- else
- aa=a;
- if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
- if (!BN_copy(computeTemp, am)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+	 * specifically the optimizations of the cache-timing attack
+	 * countermeasures and of the pre-computation. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window==5)
+ {
+ void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+ const void *table,const BN_ULONG *np,
+ const BN_ULONG *n0,int num,int power);
+ void bn_scatter5(const BN_ULONG *inp,size_t num,
+ void *table,size_t power);
+ void bn_gather5(BN_ULONG *out,size_t num,
+ void *table,size_t power);
+
+ BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i=am.top; i<top; i++) am.d[i]=0;
+ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
+
+ bn_scatter5(tmp.d,top,powerbuf,0);
+ bn_scatter5(am.d,am.top,powerbuf,1);
+ bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+ for (i=3; i<32; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#else
+ /* same as above, but uses squaring for 1/2 of operations */
+ for (i=4; i<32; i*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+ for (i=3; i<8; i+=2)
+ {
+ int j;
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ for (j=2*i; j<32; j*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,j);
+ }
+ }
+ for (; i<16; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2*i);
+ }
+ for (; i<32; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#endif
+ bits--;
+ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
+ {
+ for (wvalue=0, i=0; i<5; i++,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+ }
+
+ tmp.top=top;
+ bn_correct_top(&tmp);
+ }
+ else
+#endif
+ {
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
*/
if (window > 1)
{
- for (i=2; i<numPowers; i++)
+ if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
+ for (i=3; i<numPowers; i++)
{
/* Calculate a^i = a^(i-1) * a */
- if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+ if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
}
}
- /* Adjust the number of bits up to a multiple of the window size.
- * If the exponent length is not a multiple of the window size, then
- * this pads the most significant bits with zeros to normalize the
- * scanning loop to there's no special cases.
- *
- * * NOTE: Making the window size a power of two less than the native
- * * word size ensures that the padded bits won't go past the last
- * * word in the internal BIGNUM structure. Going past the end will
- * * still produce the correct result, but causes a different branch
- * * to be taken in the BN_is_bit_set function.
- */
- bits = ((bits+window-1)/window)*window;
- idx=bits-1; /* The top bit of the window */
-
- /* Scan the exponent one window at a time starting from the most
- * significant bits.
- */
- while (idx >= 0)
+ bits--;
+ for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
- for (i=0; i<window; i++,idx--)
+ for (i=0; i<window; i++,bits--)
{
- if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
- wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx)) goto err;
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
}
/* Fetch the appropriate pre-computed value from the pre-buf */
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
/* Multiply the result into the intermediate result */
- if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
}
+ }
/* Convert the final result from montgomery to standard format */
- if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
ret=1;
err:
if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
if (powerbuf!=NULL)
{
OPENSSL_cleanse(powerbuf,powerbufLen);
- OPENSSL_free(powerbufFree);
+ if (powerbufFree) OPENSSL_free(powerbufFree);
}
- if (am!=NULL) BN_clear(am);
- if (computeTemp!=NULL) BN_clear(computeTemp);
BN_CTX_end(ctx);
return(ret);
}
@@ -988,4 +1095,3 @@ err:
bn_check_top(r);
return(ret);
}
-
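MOD_EXP_CTIME_COPY_TO_PREBUF above stores table entry idx at byte offsets idx, idx+width, idx+2*width, ..., so fetching any entry touches the same cache lines as fetching any other — the point of the constant-time layout. A minimal sketch with toy sizes (WIDTH plays numPowers, LEN the entry size in bytes):

#include <stdio.h>
#include <string.h>

#define WIDTH 8		/* number of table entries */
#define LEN   16	/* bytes per entry */

static unsigned char table[WIDTH * LEN];

static void scatter(const unsigned char *src, int idx)
	{
	int i, j;
	for (i = 0, j = idx; i < LEN; i++, j += WIDTH)
		table[j] = src[i];	/* interleave entries byte by byte */
	}

static void gather(unsigned char *dst, int idx)
	{
	int i, j;
	for (i = 0, j = idx; i < LEN; i++, j += WIDTH)
		dst[i] = table[j];
	}

int main(void)
	{
	unsigned char in[LEN] = "fifteen bytes!!", out[LEN];
	scatter(in, 3);
	gather(out, 3);
	printf("%s\n", memcmp(in, out, LEN) ? "mismatch" : "roundtrip ok");
	return 0;
	}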
diff --git a/main/openssl/crypto/bn/bn_gcd.c b/main/openssl/crypto/bn/bn_gcd.c
index 4a352119..a808f531 100644
--- a/main/openssl/crypto/bn/bn_gcd.c
+++ b/main/openssl/crypto/bn/bn_gcd.c
@@ -205,6 +205,7 @@ err:
/* solves ax == 1 (mod n) */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
+
BIGNUM *BN_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
{
diff --git a/main/openssl/crypto/bn/bn_gf2m.c b/main/openssl/crypto/bn/bn_gf2m.c
index 432a3aa3..8a4dc20a 100644
--- a/main/openssl/crypto/bn/bn_gf2m.c
+++ b/main/openssl/crypto/bn/bn_gf2m.c
@@ -94,6 +94,8 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
#define MAX_ITERATIONS 50
@@ -122,6 +124,7 @@ static const BN_ULONG SQR_tb[16] =
SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -521,7 +522,7 @@ err:
*/
int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
- BIGNUM *b, *c, *u, *v, *tmp;
+ BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
int ret = 0;
bn_check_top(a);
@@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;i<top;i++) udp[i] = 0;
+ u->top = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+				 * it allows the optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i<top-1;i++)
+ {
+ u1 = udp[i+1];
+ udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ i = ubits; ubits = vbits; vbits = i;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = v->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ BN_ULONG ul;
+ int utop = (ubits-1)/BN_BITS2;
+
+ while ((ul=udp[utop])==0 && utop) utop--;
+ ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
+ }
+ }
+ bn_correct_top(b);
+ }
+#endif
if (!BN_copy(r, b)) goto err;
bn_check_top(r);
ret = 1;
err:
+#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
+ bn_correct_top(c);
+ bn_correct_top(u);
+ bn_correct_top(v);
+#endif
BN_CTX_end(ctx);
return ret;
}
@@ -1033,3 +1110,4 @@ int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
return 1;
}
+#endif
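The rewritten BN_GF2m_mod_inv loop works on raw words: addition in GF(2^m) is a word-wise XOR, and dividing an even polynomial by x is a multi-word right shift. A sketch of just these two primitives, assuming 64-bit words:

#include <stdint.h>
#include <stdio.h>

#define TOP 2

static void gf2m_add(uint64_t r[TOP], const uint64_t a[TOP],
	const uint64_t b[TOP])
	{
	int i;
	for (i = 0; i < TOP; i++)
		r[i] = a[i] ^ b[i];	/* no carries in characteristic 2 */
	}

static void div_by_x(uint64_t u[TOP])
	{
	int i;
	for (i = 0; i < TOP - 1; i++)
		u[i] = (u[i] >> 1) | (u[i + 1] << 63);
	u[TOP - 1] >>= 1;
	}

int main(void)
	{
	uint64_t u[TOP] = { 0x6, 0x1 }, v[TOP] = { 0x3, 0x0 }, r[TOP];
	gf2m_add(r, u, v);	/* (x^64+x^2+x) + (x+1) = x^64+x^2+1 */
	div_by_x(u);		/* x^64+x^2+x -> x^63+x+1 */
	printf("%llx %llx / %llx %llx\n",
	       (unsigned long long)r[1], (unsigned long long)r[0],
	       (unsigned long long)u[1], (unsigned long long)u[0]);
	return 0;
	}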
diff --git a/main/openssl/crypto/bn/bn_lcl.h b/main/openssl/crypto/bn/bn_lcl.h
index 8e5e98e3..817c773b 100644
--- a/main/openssl/crypto/bn/bn_lcl.h
+++ b/main/openssl/crypto/bn/bn_lcl.h
@@ -238,7 +238,7 @@ extern "C" {
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -247,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -257,7 +257,7 @@ extern "C" {
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -280,6 +280,26 @@ extern "C" {
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+#   if __GNUC__>=4 && __GNUC_MINOR__>=4 /* the "h" constraint is gone since gcc 4.4 */
+# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
+# define BN_UMULT_LOHI(low,high,a,b) ({ \
+ __uint128_t ret=(__uint128_t)(a)*(b); \
+ (high)=ret>>64; (low)=ret; })
+# else
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b)\
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
@@ -459,6 +479,10 @@ extern "C" {
}
#endif /* !BN_LLONG */
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
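The new MIPS path above illustrates the generic trick: when the compiler offers a 128-bit integer type, the high half of a 64x64-bit multiplication is one shift away. A portable sketch of what BN_UMULT_HIGH computes (requires __uint128_t, i.e. gcc/clang on a 64-bit target):

#include <stdint.h>
#include <stdio.h>

static uint64_t umult_high(uint64_t a, uint64_t b)
	{
	return (uint64_t)(((__uint128_t)a * b) >> 64);
	}

int main(void)
	{
	/* 2^63 * 2 = 2^64, so the high half is exactly 1 */
	printf("%llu\n", (unsigned long long)umult_high(1ULL << 63, 2));
	return 0;
	}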
diff --git a/main/openssl/crypto/bn/bn_lib.c b/main/openssl/crypto/bn/bn_lib.c
index 5470fbe6..7a5676de 100644
--- a/main/openssl/crypto/bn/bn_lib.c
+++ b/main/openssl/crypto/bn/bn_lib.c
@@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void)
return(&const_one);
}
-char *BN_options(void)
- {
- static int init=0;
- static char data[16];
-
- if (!init)
- {
- init++;
-#ifdef BN_LLONG
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
- }
- return(data);
- }
-
int BN_num_bits_word(BN_ULONG l)
{
static const unsigned char bits[256]={
diff --git a/main/openssl/crypto/bn/bn_mont.c b/main/openssl/crypto/bn/bn_mont.c
index 1a866880..427b5cf4 100644
--- a/main/openssl/crypto/bn/bn_mont.c
+++ b/main/openssl/crypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ err:
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
- BN_ULONG *ap,*np,*rp,n0,v,*nrp;
- int al,nl,max,i,x,ri;
+ BN_ULONG *ap,*np,*rp,n0,v,carry;
+ int nl,max,i;
n= &(mont->N);
- /* mont->ri is the size of mont->N in bits (rounded up
- to the word size) */
- al=ri=mont->ri/BN_BITS2;
-
nl=n->top;
- if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+ if (nl == 0) { ret->top=0; return(1); }
- max=(nl+al+1); /* allow for overflow (no?) XXX */
+ max=(2*nl); /* carry is stored separately */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
- nrp= &(r->d[nl]);
/* clear the top words of T */
#if 1
for (i=r->top; i<max; i++) /* memset? XXX */
- r->d[i]=0;
+ rp[i]=0;
#else
- memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
+ memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
#endif
r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
- for (i=0; i<nl; i++)
+ for (carry=0, i=0; i<nl; i++, rp++)
{
#ifdef __TANDEM
{
@@ -228,61 +223,33 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#else
v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
#endif
- nrp++;
- rp++;
- if (((nrp[-1]+=v)&BN_MASK2) >= v)
- continue;
- else
- {
- if (((++nrp[0])&BN_MASK2) != 0) continue;
- if (((++nrp[1])&BN_MASK2) != 0) continue;
- for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
- }
- }
- bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size and below code
- * is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top <= ri)
- {
- ret->top=0;
- return(1);
+ v = (v+carry+rp[nl])&BN_MASK2;
+ carry |= (v != rp[nl]);
+ carry &= (v <= rp[nl]);
+ rp[nl]=v;
}
- al=r->top-ri;
-#define BRANCH_FREE 1
-#if BRANCH_FREE
- if (bn_wexpand(ret,ri) == NULL) return(0);
- x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
- ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ if (bn_wexpand(ret,nl) == NULL) return(0);
+ ret->top=nl;
ret->neg=r->neg;
rp=ret->d;
- ap=&(r->d[ri]);
+ ap=&(r->d[nl]);
+#define BRANCH_FREE 1
+#if BRANCH_FREE
{
- size_t m1,m2;
-
- v=bn_sub_words(rp,ap,np,ri);
- /* this ----------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
+ BN_ULONG *nrp;
+ size_t m;
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
+ v=bn_sub_words(rp,ap,np,nl)-carry;
+ /* if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
- }
+ m=(0-(size_t)v);
+ nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
- /* 'i<ri' is chosen to eliminate dependency on input data, even
- * though it results in redundant copy in al<ri case. */
- for (i=0,ri-=4; i<ri; i+=4)
+ for (i=0,nl-=4; i<nl; i+=4)
{
BN_ULONG t1,t2,t3,t4;
@@ -295,40 +262,15 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
rp[i+2]=t3;
rp[i+3]=t4;
}
- for (ri+=4; i<ri; i++)
+ for (nl+=4; i<nl; i++)
rp[i]=nrp[i], ap[i]=0;
- bn_correct_top(r);
- bn_correct_top(ret);
+ }
#else
- if (bn_wexpand(ret,al) == NULL) return(0);
- ret->top=al;
- ret->neg=r->neg;
-
- rp=ret->d;
- ap=&(r->d[ri]);
- al-=4;
- for (i=0; i<al; i+=4)
- {
- BN_ULONG t1,t2,t3,t4;
-
- t1=ap[i+0];
- t2=ap[i+1];
- t3=ap[i+2];
- t4=ap[i+3];
- rp[i+0]=t1;
- rp[i+1]=t2;
- rp[i+2]=t3;
- rp[i+3]=t4;
- }
- al+=4;
- for (; i<al; i++)
- rp[i]=ap[i];
-
- if (BN_ucmp(ret, &(mont->N)) >= 0)
- {
- if (!BN_usub(ret,ret,&(mont->N))) return(0);
- }
+ if (bn_sub_words (rp,ap,np,nl)-carry)
+ memcpy(rp,ap,nl*sizeof(BN_ULONG));
#endif
+ bn_correct_top(r);
+ bn_correct_top(ret);
bn_check_top(ret);
return(1);
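The BRANCH_FREE path above turns the final conditional subtraction into a data-independent pointer blend: the borrow bit becomes an all-zero/all-one mask that selects between the reduced and unreduced values. A standalone sketch of the select, with uintptr_t standing in for PTR_SIZE_INT:

#include <stdint.h>
#include <stdio.h>

static const int *ct_select(const int *a, const int *b, uintptr_t borrow)
	{
	uintptr_t m = 0 - borrow;	/* 0 -> 0...0, 1 -> 1...1 */
	return (const int *)(((uintptr_t)a & ~m) | ((uintptr_t)b & m));
	}

int main(void)
	{
	int x = 1, y = 2;
	printf("%d %d\n", *ct_select(&x, &y, 0),
	                  *ct_select(&x, &y, 1));	/* 1 2 */
	return 0;
	}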
diff --git a/main/openssl/crypto/bn/bn_nist.c b/main/openssl/crypto/bn/bn_nist.c
index c6de0326..e22968d4 100644
--- a/main/openssl/crypto/bn/bn_nist.c
+++ b/main/openssl/crypto/bn/bn_nist.c
@@ -286,26 +286,25 @@ const BIGNUM *BN_get0_nist_prime_521(void)
}
-static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
+static void nist_cp_bn_0(BN_ULONG *dst, const BN_ULONG *src, int top, int max)
{
int i;
- BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
#ifdef BN_DEBUG
OPENSSL_assert(top <= max);
#endif
- for (i = (top); i != 0; i--)
- *_tmp1++ = *_tmp2++;
- for (i = (max) - (top); i != 0; i--)
- *_tmp1++ = (BN_ULONG) 0;
+ for (i = 0; i < top; i++)
+ dst[i] = src[i];
+ for (; i < max; i++)
+ dst[i] = 0;
}
-static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
+static void nist_cp_bn(BN_ULONG *dst, const BN_ULONG *src, int top)
{
int i;
- BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
- for (i = (top); i != 0; i--)
- *_tmp1++ = *_tmp2++;
+
+ for (i = 0; i < top; i++)
+ dst[i] = src[i];
}
#if BN_BITS2 == 64
@@ -319,6 +318,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+# if defined(__arch64__)
+# define NIST_INT64 long
+# else
+# define NIST_INT64 long long
+# endif
+# endif
#else
#define bn_cp_64(to, n, from, m) \
{ \
@@ -330,13 +336,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
bn_32_set_0(to, (n)*2); \
bn_32_set_0(to, (n)*2+1); \
}
-#if BN_BITS2 == 32
#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+# define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+# define NIST_INT64 long long
+# endif
#endif /* BN_BITS2 != 64 */
-
#define nist_set_192(to, from, a1, a2, a3) \
{ \
bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +358,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_192_TOP],
- buf[BN_NIST_192_TOP],
- c_d[BN_NIST_192_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_192_TOP];
+ unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_192_TOP],
*res;
PTR_SIZE_INT mask;
static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +395,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[3*2-6];
+ acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[3*2-5];
+ acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[3*2-6];
+ acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[3*2-5];
+ acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_192_TOP];
- nist_set_192(t_d, buf, 0, 3, 3);
+ nist_set_192(t_d, buf.bn, 0, 3, 3);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 4, 4, 0);
+ nist_set_192(t_d, buf.bn, 4, 4, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 5, 5, 5)
+ nist_set_192(t_d, buf.bn, 5, 5, 5)
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+ }
+#endif
if (carry > 0)
carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
else
@@ -407,8 +450,9 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
*/
mask = 0-(PTR_SIZE_INT)bn_sub_words(c_d,r_d,_nist_p_192[0],BN_NIST_192_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
+ res = c_d;
res = (BN_ULONG *)
- (((PTR_SIZE_INT)c_d&~mask) | ((PTR_SIZE_INT)r_d&mask));
+ (((PTR_SIZE_INT)res&~mask) | ((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_192_TOP);
r->top = BN_NIST_192_TOP;
bn_correct_top(r);
@@ -435,9 +479,11 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_224_TOP],
- buf[BN_NIST_224_TOP],
- c_d[BN_NIST_224_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_224_TOP];
+ unsigned int ui[BN_NIST_224_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_224_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -474,26 +520,68 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
/* copy upper 256 bits of 448 bit number ... */
- nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+ nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
/* ... and right shift by 32 to obtain upper 224 bits */
- nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+ nist_set_224(buf.bn, c_d, 14, 13, 12, 11, 10, 9, 8);
/* truncate lower part to 224 bits too */
r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
#else
- nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
#endif
- nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc -= bp[7-7];
+ acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc -= bp[8-7];
+ acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc -= bp[9-7];
+ acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[7-7];
+ acc += bp[11-7];
+ acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+ acc += rp[4]; acc += bp[8-7];
+ acc += bp[12-7];
+ acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[9-7];
+ acc += bp[13-7];
+ acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[10-7];
+ acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+# if BN_BITS2==64
+ rp[7] = carry;
+# endif
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_224_TOP];
+
+ nist_set_224(t_d, buf.bn, 10, 9, 8, 7, 0, 0, 0);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
+ nist_set_224(t_d, buf.bn, 0, 13, 12, 11, 0, 0, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
+ nist_set_224(t_d, buf.bn, 13, 12, 11, 10, 9, 8, 7);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
+ nist_set_224(t_d, buf.bn, 0, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
#if BN_BITS2==64
carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
#endif
+ }
+#endif
u.f = bn_sub_words;
if (carry > 0)
{
@@ -521,7 +609,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
/* otherwise it's effectively same as in BN_nist_mod_192... */
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_224[0],BN_NIST_224_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_224_TOP);
r->top = BN_NIST_224_TOP;
@@ -548,9 +637,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *a_d = a->d, *r_d;
- BN_ULONG t_d[BN_NIST_256_TOP],
- buf[BN_NIST_256_TOP],
- c_d[BN_NIST_256_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_256_TOP];
+ unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_256_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +675,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[8-8];
+ acc += bp[9-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[9-8];
+ acc += bp[10-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[10-8];
+ acc += bp[11-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[11-8];
+ acc += bp[11-8];
+ acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc -= bp[15-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc -= bp[9-8];
+ acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc -= bp[10-8];
+ acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[14-8];
+ acc += bp[13-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[8 -8];
+ acc -= bp[10-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_256_TOP];
/*S1*/
- nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+ nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
/*S2*/
- nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+ nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
/* left shift */
{
@@ -607,24 +773,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
}
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S3*/
- nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+ nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S4*/
- nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+ nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D1*/
- nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+ nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D2*/
- nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+ nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D3*/
- nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+ nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D4*/
- nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+ nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -641,7 +809,8 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_256[0],BN_NIST_256_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_256_TOP);
r->top = BN_NIST_256_TOP;
@@ -672,9 +841,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_384_TOP],
- buf[BN_NIST_384_TOP],
- c_d[BN_NIST_384_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_384_TOP];
+ unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_384_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +880,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[12-12];
+ acc += bp[21-12];
+ acc += bp[20-12];
+ acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[13-12];
+ acc += bp[22-12];
+ acc += bp[23-12];
+ acc -= bp[12-12];
+ acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[14-12];
+ acc += bp[23-12];
+ acc -= bp[13-12];
+ acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[15-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[21-12];
+ acc -= bp[14-12];
+ acc -= bp[22-12];
+ acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[21-12];
+ acc += bp[21-12];
+ acc += bp[16-12];
+ acc += bp[13-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[22-12];
+ acc -= bp[15-12];
+ acc -= bp[23-12];
+ acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[22-12];
+ acc += bp[22-12];
+ acc += bp[17-12];
+ acc += bp[14-12];
+ acc += bp[13-12];
+ acc += bp[21-12];
+ acc += bp[23-12];
+ acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[23-12];
+ acc += bp[23-12];
+ acc += bp[18-12];
+ acc += bp[15-12];
+ acc += bp[14-12];
+ acc += bp[22-12];
+ acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[19-12];
+ acc += bp[16-12];
+ acc += bp[15-12];
+ acc += bp[23-12];
+ acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[8]; acc += bp[20-12];
+ acc += bp[17-12];
+ acc += bp[16-12];
+ acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[9]; acc += bp[21-12];
+ acc += bp[18-12];
+ acc += bp[17-12];
+ acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[10]; acc += bp[22-12];
+ acc += bp[19-12];
+ acc += bp[18-12];
+ acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[11]; acc += bp[23-12];
+ acc += bp[20-12];
+ acc += bp[19-12];
+ acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_384_TOP];
/*S1*/
- nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+ nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
/* left shift */
{
register BN_ULONG *ap,t,c;
@@ -729,29 +990,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
t_d, BN_NIST_256_TOP);
/*S2 */
- carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+ carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
/*S3*/
- nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+ nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S4*/
- nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+ nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S5*/
- nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+ nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S6*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D1*/
- nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+ nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D2*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D3*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -768,7 +1031,8 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_384[0],BN_NIST_384_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_384_TOP);
r->top = BN_NIST_384_TOP;
@@ -834,7 +1098,8 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
bn_add_words(r_d,r_d,t_d,BN_NIST_521_TOP);
mask = 0-(PTR_SIZE_INT)bn_sub_words(t_d,r_d,_nist_p_521,BN_NIST_521_TOP);
- res = (BN_ULONG *)(((PTR_SIZE_INT)t_d&~mask) |
+ res = t_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d,res,BN_NIST_521_TOP);
r->top = BN_NIST_521_TOP;
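The NIST_INT64 fast paths added above all follow one pattern: treat the operands as 32-bit limbs, add and subtract limbs into a signed 64-bit accumulator, and propagate the (possibly negative) carry with an arithmetic right shift. A minimal illustration of one carry step:

#include <stdint.h>
#include <stdio.h>

int main(void)
	{
	uint32_t r[2] = { 0xffffffffu, 0 };
	int64_t acc;

	acc = r[0];
	acc += 5;			/* overflows 32 bits */
	r[0] = (uint32_t)acc;
	acc >>= 32;			/* carry = +1 */
	acc += r[1];
	r[1] = (uint32_t)acc;
	printf("%08x %08x carry=%d\n", r[1], r[0], (int)(acc >> 32));
	return 0;
	}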
diff --git a/main/openssl/crypto/bn/bn_print.c b/main/openssl/crypto/bn/bn_print.c
index bebb466d..1743b6a7 100644
--- a/main/openssl/crypto/bn/bn_print.c
+++ b/main/openssl/crypto/bn/bn_print.c
@@ -357,3 +357,22 @@ end:
return(ret);
}
#endif
+
+char *BN_options(void)
+ {
+ static int init=0;
+ static char data[16];
+
+ if (!init)
+ {
+ init++;
+#ifdef BN_LLONG
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+ }
+ return(data);
+ }
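
BN_options() simply reports the word sizes the library was compiled with
(the #ifdef picks BN_ULLONG only when BN_LLONG is defined). A hypothetical
use, where the exact string depends on the build:

    #include <stdio.h>
    #include <openssl/bn.h>

    int main(void)
        {
        /* Prints e.g. "bn(64,32)" on a 32-bit BN_LLONG build, or
         * "bn(64,64)" on a typical 64-bit build. */
        printf("%s\n", BN_options());
        return 0;
        }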
diff --git a/main/openssl/crypto/bn/bn_rand.c b/main/openssl/crypto/bn/bn_rand.c
index b376c28f..55676f07 100644
--- a/main/openssl/crypto/bn/bn_rand.c
+++ b/main/openssl/crypto/bn/bn_rand.c
@@ -114,6 +114,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
+#include <openssl/sha.h>
static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
{
@@ -303,3 +304,72 @@ int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(1, r, range);
}
+
+#ifndef OPENSSL_NO_SHA512
+/* BN_generate_dsa_nonce generates a random number 0 <= out < range. Unlike
+ * BN_rand_range, it also includes the contents of |priv| and |message| in the
+ * generation so that an RNG failure isn't fatal as long as |priv| remains
+ * secret. This is intended for use in DSA and ECDSA where an RNG weakness
+ * leads directly to private key exposure unless this function is used. */
+int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range, const BIGNUM* priv,
+ const unsigned char *message, size_t message_len,
+ BN_CTX *ctx)
+ {
+ SHA512_CTX sha;
+ /* We use 512 bits of random data per iteration to
+ * ensure that we have at least |range| bits of randomness. */
+ unsigned char random_bytes[64];
+ unsigned char digest[SHA512_DIGEST_LENGTH];
+ unsigned done, todo;
+ /* We generate |range|+8 bytes of random output. */
+ const unsigned num_k_bytes = BN_num_bytes(range) + 8;
+ unsigned char private_bytes[96];
+ unsigned char *k_bytes;
+ int ret = 0;
+
+ k_bytes = OPENSSL_malloc(num_k_bytes);
+ if (!k_bytes)
+ goto err;
+
+ /* We copy |priv| into a local buffer to avoid exposing its length. */
+ todo = sizeof(priv->d[0])*priv->top;
+ if (todo > sizeof(private_bytes))
+ {
+ /* No reasonable DSA or ECDSA key should have a private key
+ * this large and we don't handle this case in order to avoid
+ * leaking the length of the private key. */
+ BNerr(BN_F_BN_GENERATE_DSA_NONCE, BN_R_PRIVATE_KEY_TOO_LARGE);
+ goto err;
+ }
+ memcpy(private_bytes, priv->d, todo);
+ memset(private_bytes + todo, 0, sizeof(private_bytes) - todo);
+
+ for (done = 0; done < num_k_bytes;) {
+ if (RAND_bytes(random_bytes, sizeof(random_bytes)) != 1)
+ goto err;
+ SHA512_Init(&sha);
+ SHA512_Update(&sha, &done, sizeof(done));
+ SHA512_Update(&sha, private_bytes, sizeof(private_bytes));
+ SHA512_Update(&sha, message, message_len);
+ SHA512_Update(&sha, random_bytes, sizeof(random_bytes));
+ SHA512_Final(digest, &sha);
+
+ todo = num_k_bytes - done;
+ if (todo > SHA512_DIGEST_LENGTH)
+ todo = SHA512_DIGEST_LENGTH;
+ memcpy(k_bytes + done, digest, todo);
+ done += todo;
+ }
+
+ if (!BN_bin2bn(k_bytes, num_k_bytes, out))
+ goto err;
+ if (BN_mod(out, out, range, ctx) != 1)
+ goto err;
+ ret = 1;
+
+err:
+ if (k_bytes)
+ OPENSSL_free(k_bytes);
+ return ret;
+ }
+#endif /* OPENSSL_NO_SHA512 */
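
A minimal sketch of how an (EC)DSA signer might call the new function;
order, priv_key, digest and digest_len stand in for the caller's values and
are not part of this patch:

    /* Choose the per-signature nonce k, mixing the private key and the
     * message into the derivation so a weak RAND_bytes alone cannot
     * leak the key. Error handling trimmed for brevity. */
    BIGNUM *k = BN_new();
    BN_CTX *bnctx = BN_CTX_new();
    if (k && bnctx &&
        BN_generate_dsa_nonce(k, order, priv_key, digest, digest_len, bnctx))
        {
        /* use k as the nonce, then discard it */
        }
    BN_CTX_free(bnctx);
    BN_clear_free(k);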
diff --git a/main/openssl/crypto/bn/bn_shift.c b/main/openssl/crypto/bn/bn_shift.c
index c4d301af..a6fca2c4 100644
--- a/main/openssl/crypto/bn/bn_shift.c
+++ b/main/openssl/crypto/bn/bn_shift.c
@@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a)
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap,*rp,t,c;
- int i;
+ int i,j;
bn_check_top(r);
bn_check_top(a);
@@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a)
BN_zero(r);
return(1);
}
+ i = a->top;
+ ap= a->d;
+ j = i-(ap[i-1]==1);
if (a != r)
{
- if (bn_wexpand(r,a->top) == NULL) return(0);
- r->top=a->top;
+ if (bn_wexpand(r,j) == NULL) return(0);
r->neg=a->neg;
}
- ap=a->d;
rp=r->d;
- c=0;
- for (i=a->top-1; i>=0; i--)
+ t=ap[--i];
+ c=(t&1)?BN_TBIT:0;
+ if (t>>=1) rp[i]=t;
+ while (i>0)
{
- t=ap[i];
+ t=ap[--i];
rp[i]=((t>>1)&BN_MASK2)|c;
c=(t&1)?BN_TBIT:0;
}
- bn_correct_top(r);
+ r->top=j;
bn_check_top(r);
return(1);
}
@@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
BN_zero(r);
return(1);
}
+ i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
if (r != a)
{
r->neg=a->neg;
- if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+ if (bn_wexpand(r,i) == NULL) return(0);
}
else
{
@@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
f= &(a->d[nw]);
t=r->d;
j=a->top-nw;
- r->top=j;
+ r->top=i;
if (rb == 0)
{
@@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
l= *(f++);
*(t++) =(tmp|(l<<lb))&BN_MASK2;
}
- *(t++) =(l>>rb)&BN_MASK2;
+ if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
}
- bn_correct_top(r);
bn_check_top(r);
return(1);
}
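
The rewritten shifts size the result up front instead of trimming it
afterwards with bn_correct_top: for BN_rshift1 the word count only shrinks
when the top word is exactly 1, hence j = i-(ap[i-1]==1). A small
illustration, assuming 32-bit BN_ULONG:

    BIGNUM *a = BN_new(), *r = BN_new();
    BN_set_word(a, 1);
    BN_lshift(a, a, 32);   /* a = 2^32: d = {0, 1}, top = 2 */
    BN_rshift1(r, a);      /* r = 2^31: one word, so j = i - 1 */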
diff --git a/main/openssl/crypto/bn/bn_word.c b/main/openssl/crypto/bn/bn_word.c
index ee7b87c4..de83a15b 100644
--- a/main/openssl/crypto/bn/bn_word.c
+++ b/main/openssl/crypto/bn/bn_word.c
@@ -144,26 +144,17 @@ int BN_add_word(BIGNUM *a, BN_ULONG w)
a->neg=!(a->neg);
return(i);
}
- /* Only expand (and risk failing) if it's possibly necessary */
- if (((BN_ULONG)(a->d[a->top - 1] + 1) == 0) &&
- (bn_wexpand(a,a->top+1) == NULL))
- return(0);
- i=0;
- for (;;)
+ for (i=0;w!=0 && i<a->top;i++)
{
- if (i >= a->top)
- l=w;
- else
- l=(a->d[i]+w)&BN_MASK2;
- a->d[i]=l;
- if (w > l)
- w=1;
- else
- break;
- i++;
+ a->d[i] = l = (a->d[i]+w)&BN_MASK2;
+ w = (w>l)?1:0;
}
- if (i >= a->top)
+ if (w && i==a->top)
+ {
+ if (bn_wexpand(a,a->top+1) == NULL) return 0;
a->top++;
+ a->d[i]=w;
+ }
bn_check_top(a);
return(1);
}
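
The reworked BN_add_word only expands once the carry has rippled past every
existing word. An illustration; all-ones words force the worst case:

    BIGNUM *a = BN_new();
    BN_set_word(a, 1);
    BN_lshift(a, a, 128);
    BN_sub_word(a, 1);     /* a = 2^128 - 1: every word is all-ones */
    BN_add_word(a, 1);     /* the carry propagates through each word; w is
                            * still 1 when i == a->top, so bn_wexpand adds
                            * the new top word and a becomes 2^128 */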
diff --git a/main/openssl/crypto/bn/bntest.c b/main/openssl/crypto/bn/bntest.c
index 0cd99c5b..06f5954a 100644
--- a/main/openssl/crypto/bn/bntest.c
+++ b/main/openssl/crypto/bn/bntest.c
@@ -262,7 +262,7 @@ int main(int argc, char *argv[])
message(out,"BN_mod_sqrt");
if (!test_sqrt(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#ifndef OPENSSL_NO_EC2M
message(out,"BN_GF2m_add");
if (!test_gf2m_add(out)) goto err;
(void)BIO_flush(out);
@@ -298,7 +298,7 @@ int main(int argc, char *argv[])
message(out,"BN_GF2m_mod_solve_quad");
if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#endif
BN_CTX_free(ctx);
BIO_free(out);
@@ -1061,7 +1061,7 @@ int test_exp(BIO *bp, BN_CTX *ctx)
BN_free(one);
return(1);
}
-
+#ifndef OPENSSL_NO_EC2M
int test_gf2m_add(BIO *bp)
{
BIGNUM a,b,c;
@@ -1636,7 +1636,7 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
BN_free(e);
return ret;
}
-
+#endif
static int genprime_cb(int p, int n, BN_GENCB *arg)
{
char c='*';
diff --git a/main/openssl/crypto/buffer/buf_str.c b/main/openssl/crypto/buffer/buf_str.c
new file mode 100644
index 00000000..151f5ea9
--- /dev/null
+++ b/main/openssl/crypto/buffer/buf_str.c
@@ -0,0 +1,119 @@
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+
+char *BUF_strdup(const char *str)
+ {
+ if (str == NULL) return(NULL);
+ return BUF_strndup(str, strlen(str));
+ }
+
+char *BUF_strndup(const char *str, size_t siz)
+ {
+ char *ret;
+
+ if (str == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz+1);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ BUF_strlcpy(ret,str,siz+1);
+ return(ret);
+ }
+
+void *BUF_memdup(const void *data, size_t siz)
+ {
+ void *ret;
+
+ if (data == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ return memcpy(ret, data, siz);
+ }
+
+size_t BUF_strlcpy(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 1 && *src; size--)
+ {
+ *dst++ = *src++;
+ l++;
+ }
+ if (size)
+ *dst = '\0';
+ return l + strlen(src);
+ }
+
+size_t BUF_strlcat(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 0 && *dst; size--, dst++)
+ l++;
+ return l + BUF_strlcpy(dst, src, size);
+ }
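
Like BSD strlcpy/strlcat, these return the length the untruncated result
would have had, so truncation is detected by comparing against the buffer
size. A short example:

    char buf[8];
    size_t n = BUF_strlcpy(buf, "a longer string", sizeof(buf));
    /* buf now holds "a longe" plus NUL; n == 15, the full source
     * length, so n >= sizeof(buf) signals truncation. */
    if (n >= sizeof(buf))
        { /* handle truncation */ }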
diff --git a/main/openssl/crypto/buffer/buffer.c b/main/openssl/crypto/buffer/buffer.c
index 620ea8d5..d4a4ce43 100644
--- a/main/openssl/crypto/buffer/buffer.c
+++ b/main/openssl/crypto/buffer/buffer.c
@@ -60,6 +60,11 @@
#include "cryptlib.h"
#include <openssl/buffer.h>
+/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That
+ * expression is applied in several functions in this file and this limit
+ * ensures that the result fits in an int. */
+#define LIMIT_BEFORE_EXPANSION 0x5ffffffc
+
BUF_MEM *BUF_MEM_new(void)
{
BUF_MEM *ret;
@@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -162,72 +179,14 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
return(len);
}
-char *BUF_strdup(const char *str)
- {
- if (str == NULL) return(NULL);
- return BUF_strndup(str, strlen(str));
- }
-
-char *BUF_strndup(const char *str, size_t siz)
- {
- char *ret;
-
- if (str == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz+1);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- BUF_strlcpy(ret,str,siz+1);
- return(ret);
- }
-
-void *BUF_memdup(const void *data, size_t siz)
- {
- void *ret;
-
- if (data == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- return memcpy(ret, data, siz);
- }
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 1 && *src; size--)
- {
- *dst++ = *src++;
- l++;
- }
- if (size)
- *dst = '\0';
- return l + strlen(src);
- }
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 0 && *dst; size--, dst++)
- l++;
- return l + BUF_strlcpy(dst, src, size);
- }
-
-void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
+void BUF_reverse(unsigned char *out, const unsigned char *in, size_t size)
{
size_t i;
if (in)
{
out += size - 1;
for (i = 0; i < size; i++)
- *in++ = *out--;
+ *out-- = *in++;
}
else
{
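
A quick check of the new bound: the growth target (len+3)/3*4 still fits in
a signed int at LIMIT_BEFORE_EXPANSION and would reach 2**31 one byte above
it, so the limit is exact:

    #include <assert.h>

    int main(void)
        {
        unsigned long n = 0x5ffffffcUL;             /* LIMIT_BEFORE_EXPANSION */
        assert((n + 3) / 3 * 4 == 0x7ffffffcUL);    /* largest value < 2**31 */
        assert((n + 4) / 3 * 4 == 0x80000000UL);    /* n+1 would hit 2**31 */
        return 0;
        }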
diff --git a/main/openssl/crypto/buffer/buffer.h b/main/openssl/crypto/buffer/buffer.h
index 178e4182..f8da32b4 100644
--- a/main/openssl/crypto/buffer/buffer.h
+++ b/main/openssl/crypto/buffer/buffer.h
@@ -88,7 +88,7 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len);
char * BUF_strdup(const char *str);
char * BUF_strndup(const char *str, size_t siz);
void * BUF_memdup(const void *data, size_t siz);
-void BUF_reverse(unsigned char *out, unsigned char *in, size_t siz);
+void BUF_reverse(unsigned char *out, const unsigned char *in, size_t siz);
/* safe string functions */
size_t BUF_strlcpy(char *dst,const char *src,size_t siz);
diff --git a/main/openssl/crypto/cmac/cm_ameth.c b/main/openssl/crypto/cmac/cm_ameth.c
new file mode 100644
index 00000000..0b8e5670
--- /dev/null
+++ b/main/openssl/crypto/cmac/cm_ameth.c
@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+ {
+ return EVP_MAX_BLOCK_LENGTH;
+ }
+
+static void cmac_key_free(EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+ if (cmctx)
+ CMAC_CTX_free(cmctx);
+ }
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_CMAC,
+ 0,
+
+ "CMAC",
+ "OpenSSL CMAC method",
+
+ 0,0,0,0,
+
+ 0,0,0,
+
+ cmac_size,
+ 0,
+ 0,0,0,0,0,0,0,
+
+ cmac_key_free,
+ 0,
+ 0,0
+ };
+
diff --git a/main/openssl/crypto/cmac/cm_pmeth.c b/main/openssl/crypto/cmac/cm_pmeth.c
new file mode 100644
index 00000000..072228ec
--- /dev/null
+++ b/main/openssl/crypto/cmac/cm_pmeth.c
@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" are simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+ {
+ ctx->data = CMAC_CTX_new();
+ if (!ctx->data)
+ return 0;
+ ctx->keygen_info_count = 0;
+ return 1;
+ }
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+ {
+ if (!pkey_cmac_init(dst))
+ return 0;
+ if (!CMAC_CTX_copy(dst->data, src->data))
+ return 0;
+ return 1;
+ }
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+ {
+ CMAC_CTX_free(ctx->data);
+ }
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmkey = CMAC_CTX_new();
+ CMAC_CTX *cmctx = ctx->data;
+ if (!cmkey)
+ return 0;
+ if (!CMAC_CTX_copy(cmkey, cmctx))
+ {
+ CMAC_CTX_free(cmkey);
+ return 0;
+ }
+ EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+
+ return 1;
+ }
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+ {
+ if (!CMAC_Update(ctx->pctx->data, data, count))
+ return 0;
+ return 1;
+ }
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+ {
+ EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+ mctx->update = int_update;
+ return 1;
+ }
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+ EVP_MD_CTX *mctx)
+ {
+ return CMAC_Final(ctx->data, sig, siglen);
+ }
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+ {
+ CMAC_CTX *cmctx = ctx->data;
+ switch (type)
+ {
+
+ case EVP_PKEY_CTRL_SET_MAC_KEY:
+ if (!p2 || p1 < 0)
+ return 0;
+ if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_CIPHER:
+ if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_MD:
+ if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+ (CMAC_CTX *)ctx->pkey->pkey.ptr))
+ return 0;
+ if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+ return 0;
+ break;
+
+ default:
+ return -2;
+
+ }
+ return 1;
+ }
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+ const char *type, const char *value)
+ {
+ if (!value)
+ {
+ return 0;
+ }
+ if (!strcmp(type, "key"))
+ {
+ void *p = (void *)value;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+ strlen(p), p);
+ }
+ if (!strcmp(type, "cipher"))
+ {
+ const EVP_CIPHER *c;
+ c = EVP_get_cipherbyname(value);
+ if (!c)
+ return 0;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+ }
+ if (!strcmp(type, "hexkey"))
+ {
+ unsigned char *key;
+ int r;
+ long keylen;
+ key = string_to_hex(value, &keylen);
+ if (!key)
+ return 0;
+ r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+ OPENSSL_free(key);
+ return r;
+ }
+ return -2;
+ }
+
+const EVP_PKEY_METHOD cmac_pkey_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+ pkey_cmac_init,
+ pkey_cmac_copy,
+ pkey_cmac_cleanup,
+
+ 0, 0,
+
+ 0,
+ pkey_cmac_keygen,
+
+ 0, 0,
+
+ 0, 0,
+
+ 0,0,
+
+ cmac_signctx_init,
+ cmac_signctx,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ pkey_cmac_ctrl,
+ pkey_cmac_ctrl_str
+
+ };
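
The ctrl strings registered in pkey_cmac_ctrl_str() are reachable through
the generic EVP_PKEY_CTX interface; a sketch, assuming the CMAC method is
linked in and with error checks trimmed:

    /* Build an EVP_PKEY-level CMAC key; keygen ends up in
     * pkey_cmac_keygen() above. */
    EVP_PKEY *pkey = NULL;
    EVP_PKEY_CTX *kctx = EVP_PKEY_CTX_new_id(EVP_PKEY_CMAC, NULL);
    EVP_PKEY_keygen_init(kctx);
    EVP_PKEY_CTX_ctrl_str(kctx, "cipher", "aes-128-cbc");
    EVP_PKEY_CTX_ctrl_str(kctx, "hexkey",
                          "000102030405060708090a0b0c0d0e0f");
    EVP_PKEY_keygen(kctx, &pkey);
    EVP_PKEY_CTX_free(kctx);
    /* pkey is now usable with EVP_DigestSignInit()/EVP_DigestSignFinal(). */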
diff --git a/main/openssl/crypto/cmac/cmac.c b/main/openssl/crypto/cmac/cmac.c
new file mode 100644
index 00000000..8b72b096
--- /dev/null
+++ b/main/openssl/crypto/cmac/cmac.c
@@ -0,0 +1,308 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+ {
+ /* Cipher context to use */
+ EVP_CIPHER_CTX cctx;
+ /* Keys k1 and k2 */
+ unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+ unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+ /* Temporary block */
+ unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+ /* Last (possibly partial) block */
+ unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+ /* Number of bytes in last block: -1 means context not initialised */
+ int nlast_block;
+ };
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+ {
+ int i;
+ /* Shift block to left, including carry */
+ for (i = 0; i < bl; i++)
+ {
+ k1[i] = l[i] << 1;
+ if (i < bl - 1 && l[i + 1] & 0x80)
+ k1[i] |= 1;
+ }
+ /* If MSB set fixup with R */
+ if (l[0] & 0x80)
+ k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+ }
+
+CMAC_CTX *CMAC_CTX_new(void)
+ {
+ CMAC_CTX *ctx;
+ ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+ if (!ctx)
+ return NULL;
+ EVP_CIPHER_CTX_init(&ctx->cctx);
+ ctx->nlast_block = -1;
+ return ctx;
+ }
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ {
+ FIPS_cmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
+ EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+ OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+ ctx->nlast_block = -1;
+ }
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+ {
+ return &ctx->cctx;
+ }
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+ {
+ CMAC_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+ {
+ int bl;
+ if (in->nlast_block == -1)
+ return 0;
+ if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+ memcpy(out->k1, in->k1, bl);
+ memcpy(out->k2, in->k2, bl);
+ memcpy(out->tbl, in->tbl, bl);
+ memcpy(out->last_block, in->last_block, bl);
+ out->nlast_block = in->nlast_block;
+ return 1;
+ }
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl)
+ {
+ static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+		/* If we have an ENGINE we need to allow non-FIPS */
+ if ((impl || ctx->cctx.engine)
+ && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+ {
+ EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_cipherinit().
+ */
+ if (!impl && !ctx->cctx.engine)
+ return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+ }
+#endif
+ /* All zeros means restart */
+ if (!key && !cipher && !impl && keylen == 0)
+ {
+ /* Not initialised */
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
+ ctx->nlast_block = 0;
+ return 1;
+ }
+	/* Initialise context */
+ if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+ return 0;
+ /* Non-NULL key means initialisation complete */
+ if (key)
+ {
+ int bl;
+ if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+ return 0;
+ if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+ return 0;
+ make_kn(ctx->k1, ctx->tbl, bl);
+ make_kn(ctx->k2, ctx->k1, bl);
+ OPENSSL_cleanse(ctx->tbl, bl);
+ /* Reset context again ready for first data block */
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ /* Zero tbl so resume works */
+ memset(ctx->tbl, 0, bl);
+ ctx->nlast_block = 0;
+ }
+ return 1;
+ }
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+ {
+ const unsigned char *data = in;
+ size_t bl;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_update(ctx, in, dlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (dlen == 0)
+ return 1;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ /* Copy into partial block if we need to */
+ if (ctx->nlast_block > 0)
+ {
+ size_t nleft;
+ nleft = bl - ctx->nlast_block;
+ if (dlen < nleft)
+ nleft = dlen;
+ memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+ dlen -= nleft;
+ ctx->nlast_block += nleft;
+ /* If no more to process return */
+ if (dlen == 0)
+ return 1;
+ data += nleft;
+ /* Else not final block so encrypt it */
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+ return 0;
+ }
+ /* Encrypt all but one of the complete blocks left */
+ while(dlen > bl)
+ {
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+ return 0;
+ dlen -= bl;
+ data += bl;
+ }
+ /* Copy any data left to last block buffer */
+ memcpy(ctx->last_block, data, dlen);
+ ctx->nlast_block = dlen;
+ return 1;
+
+ }
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+ {
+ int i, bl, lb;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ *poutlen = (size_t)bl;
+ if (!out)
+ return 1;
+ lb = ctx->nlast_block;
+ /* Is last block complete? */
+ if (lb == bl)
+ {
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k1[i];
+ }
+ else
+ {
+ ctx->last_block[lb] = 0x80;
+ if (bl - lb > 1)
+ memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k2[i];
+ }
+ if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+ {
+ OPENSSL_cleanse(out, bl);
+ return 0;
+ }
+ return 1;
+ }
+
+int CMAC_resume(CMAC_CTX *ctx)
+ {
+ if (ctx->nlast_block == -1)
+ return 0;
+	/* The buffer "tbl" contains the last fully encrypted block
+	 * which is the last IV (or all zeroes if no last encrypted block).
+	 * The last block has not been modified since CMAC_Final().
+	 * So reinitialising using the last encrypted block will allow
+	 * CMAC to continue after calling CMAC_Final().
+	 */
+ return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+ }
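
A minimal sketch tying the whole API together: AES-128 CMAC over msg (key,
msg and msg_len are placeholders for the caller's data):

    unsigned char mac[EVP_MAX_BLOCK_LENGTH];
    size_t maclen;
    CMAC_CTX *cctx = CMAC_CTX_new();
    if (cctx
        && CMAC_Init(cctx, key, 16, EVP_aes_128_cbc(), NULL)
        && CMAC_Update(cctx, msg, msg_len)
        && CMAC_Final(cctx, mac, &maclen))
        {
        /* maclen == 16 for AES; mac holds the tag */
        }
    CMAC_CTX_free(cctx);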
diff --git a/main/openssl/crypto/cmac/cmac.h b/main/openssl/crypto/cmac/cmac.h
new file mode 100644
index 00000000..712e92dc
--- /dev/null
+++ b/main/openssl/crypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/main/openssl/crypto/cms/cms.h b/main/openssl/crypto/cms/cms.h
new file mode 100644
index 00000000..36994fa6
--- /dev/null
+++ b/main/openssl/crypto/cms/cms.h
@@ -0,0 +1,501 @@
+/* crypto/cms/cms.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMS_H
+#define HEADER_CMS_H
+
+#include <openssl/x509.h>
+
+#ifdef OPENSSL_NO_CMS
+#error CMS is disabled.
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct CMS_ContentInfo_st CMS_ContentInfo;
+typedef struct CMS_SignerInfo_st CMS_SignerInfo;
+typedef struct CMS_CertificateChoices CMS_CertificateChoices;
+typedef struct CMS_RevocationInfoChoice_st CMS_RevocationInfoChoice;
+typedef struct CMS_RecipientInfo_st CMS_RecipientInfo;
+typedef struct CMS_ReceiptRequest_st CMS_ReceiptRequest;
+typedef struct CMS_Receipt_st CMS_Receipt;
+
+DECLARE_STACK_OF(CMS_SignerInfo)
+DECLARE_STACK_OF(GENERAL_NAMES)
+DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo)
+DECLARE_ASN1_FUNCTIONS(CMS_ReceiptRequest)
+DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
+
+#define CMS_SIGNERINFO_ISSUER_SERIAL 0
+#define CMS_SIGNERINFO_KEYIDENTIFIER 1
+
+#define CMS_RECIPINFO_TRANS 0
+#define CMS_RECIPINFO_AGREE 1
+#define CMS_RECIPINFO_KEK 2
+#define CMS_RECIPINFO_PASS 3
+#define CMS_RECIPINFO_OTHER 4
+
+/* S/MIME related flags */
+
+#define CMS_TEXT 0x1
+#define CMS_NOCERTS 0x2
+#define CMS_NO_CONTENT_VERIFY 0x4
+#define CMS_NO_ATTR_VERIFY 0x8
+#define CMS_NOSIGS \
+ (CMS_NO_CONTENT_VERIFY|CMS_NO_ATTR_VERIFY)
+#define CMS_NOINTERN 0x10
+#define CMS_NO_SIGNER_CERT_VERIFY 0x20
+#define CMS_NOVERIFY 0x20
+#define CMS_DETACHED 0x40
+#define CMS_BINARY 0x80
+#define CMS_NOATTR 0x100
+#define CMS_NOSMIMECAP 0x200
+#define CMS_NOOLDMIMETYPE 0x400
+#define CMS_CRLFEOL 0x800
+#define CMS_STREAM 0x1000
+#define CMS_NOCRL 0x2000
+#define CMS_PARTIAL 0x4000
+#define CMS_REUSE_DIGEST 0x8000
+#define CMS_USE_KEYID 0x10000
+#define CMS_DEBUG_DECRYPT 0x20000
+
+const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms);
+
+BIO *CMS_dataInit(CMS_ContentInfo *cms, BIO *icont);
+int CMS_dataFinal(CMS_ContentInfo *cms, BIO *bio);
+
+ASN1_OCTET_STRING **CMS_get0_content(CMS_ContentInfo *cms);
+int CMS_is_detached(CMS_ContentInfo *cms);
+int CMS_set_detached(CMS_ContentInfo *cms, int detached);
+
+#ifdef HEADER_PEM_H
+DECLARE_PEM_rw_const(CMS, CMS_ContentInfo)
+#endif
+
+int CMS_stream(unsigned char ***boundary, CMS_ContentInfo *cms);
+CMS_ContentInfo *d2i_CMS_bio(BIO *bp, CMS_ContentInfo **cms);
+int i2d_CMS_bio(BIO *bp, CMS_ContentInfo *cms);
+
+BIO *BIO_new_CMS(BIO *out, CMS_ContentInfo *cms);
+int i2d_CMS_bio_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags);
+int PEM_write_bio_CMS_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags);
+CMS_ContentInfo *SMIME_read_CMS(BIO *bio, BIO **bcont);
+int SMIME_write_CMS(BIO *bio, CMS_ContentInfo *cms, BIO *data, int flags);
+
+int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, unsigned int flags);
+
+CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs,
+ BIO *data, unsigned int flags);
+
+CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si,
+ X509 *signcert, EVP_PKEY *pkey,
+ STACK_OF(X509) *certs,
+ unsigned int flags);
+
+int CMS_data(CMS_ContentInfo *cms, BIO *out, unsigned int flags);
+CMS_ContentInfo *CMS_data_create(BIO *in, unsigned int flags);
+
+int CMS_digest_verify(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags);
+CMS_ContentInfo *CMS_digest_create(BIO *in, const EVP_MD *md,
+ unsigned int flags);
+
+int CMS_EncryptedData_decrypt(CMS_ContentInfo *cms,
+ const unsigned char *key, size_t keylen,
+ BIO *dcont, BIO *out, unsigned int flags);
+
+CMS_ContentInfo *CMS_EncryptedData_encrypt(BIO *in, const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen,
+ unsigned int flags);
+
+int CMS_EncryptedData_set1_key(CMS_ContentInfo *cms, const EVP_CIPHER *ciph,
+ const unsigned char *key, size_t keylen);
+
+int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ X509_STORE *store, BIO *dcont, BIO *out, unsigned int flags);
+
+int CMS_verify_receipt(CMS_ContentInfo *rcms, CMS_ContentInfo *ocms,
+ STACK_OF(X509) *certs,
+ X509_STORE *store, unsigned int flags);
+
+STACK_OF(X509) *CMS_get0_signers(CMS_ContentInfo *cms);
+
+CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *in,
+ const EVP_CIPHER *cipher, unsigned int flags);
+
+int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pkey, X509 *cert,
+ BIO *dcont, BIO *out,
+ unsigned int flags);
+
+int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert);
+int CMS_decrypt_set1_key(CMS_ContentInfo *cms,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen);
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms,
+ unsigned char *pass, ossl_ssize_t passlen);
+
+STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms);
+int CMS_RecipientInfo_type(CMS_RecipientInfo *ri);
+CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher);
+CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
+ X509 *recip, unsigned int flags);
+int CMS_RecipientInfo_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pkey);
+int CMS_RecipientInfo_ktri_cert_cmp(CMS_RecipientInfo *ri, X509 *cert);
+int CMS_RecipientInfo_ktri_get0_algs(CMS_RecipientInfo *ri,
+ EVP_PKEY **pk, X509 **recip,
+ X509_ALGOR **palg);
+int CMS_RecipientInfo_ktri_get0_signer_id(CMS_RecipientInfo *ri,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+
+CMS_RecipientInfo *CMS_add0_recipient_key(CMS_ContentInfo *cms, int nid,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen,
+ ASN1_GENERALIZEDTIME *date,
+ ASN1_OBJECT *otherTypeId,
+ ASN1_TYPE *otherType);
+
+int CMS_RecipientInfo_kekri_get0_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pid,
+ ASN1_GENERALIZEDTIME **pdate,
+ ASN1_OBJECT **potherid,
+ ASN1_TYPE **pothertype);
+
+int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri,
+ unsigned char *key, size_t keylen);
+
+int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri,
+ const unsigned char *id, size_t idlen);
+
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri,
+ unsigned char *pass,
+ ossl_ssize_t passlen);
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+ int iter, int wrap_nid, int pbe_nid,
+ unsigned char *pass,
+ ossl_ssize_t passlen,
+ const EVP_CIPHER *kekciph);
+
+int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri);
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags);
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags);
+
+int CMS_set1_eContentType(CMS_ContentInfo *cms, const ASN1_OBJECT *oid);
+const ASN1_OBJECT *CMS_get0_eContentType(CMS_ContentInfo *cms);
+
+CMS_CertificateChoices *CMS_add0_CertificateChoices(CMS_ContentInfo *cms);
+int CMS_add0_cert(CMS_ContentInfo *cms, X509 *cert);
+int CMS_add1_cert(CMS_ContentInfo *cms, X509 *cert);
+STACK_OF(X509) *CMS_get1_certs(CMS_ContentInfo *cms);
+
+CMS_RevocationInfoChoice *CMS_add0_RevocationInfoChoice(CMS_ContentInfo *cms);
+int CMS_add0_crl(CMS_ContentInfo *cms, X509_CRL *crl);
+int CMS_add1_crl(CMS_ContentInfo *cms, X509_CRL *crl);
+STACK_OF(X509_CRL) *CMS_get1_crls(CMS_ContentInfo *cms);
+
+int CMS_SignedData_init(CMS_ContentInfo *cms);
+CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
+ X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
+ unsigned int flags);
+STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms);
+
+void CMS_SignerInfo_set1_signer_cert(CMS_SignerInfo *si, X509 *signer);
+int CMS_SignerInfo_get0_signer_id(CMS_SignerInfo *si,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+int CMS_SignerInfo_cert_cmp(CMS_SignerInfo *si, X509 *cert);
+int CMS_set1_signers_certs(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ unsigned int flags);
+void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk, X509 **signer,
+ X509_ALGOR **pdig, X509_ALGOR **psig);
+int CMS_SignerInfo_sign(CMS_SignerInfo *si);
+int CMS_SignerInfo_verify(CMS_SignerInfo *si);
+int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain);
+
+int CMS_add_smimecap(CMS_SignerInfo *si, STACK_OF(X509_ALGOR) *algs);
+int CMS_add_simple_smimecap(STACK_OF(X509_ALGOR) **algs,
+ int algnid, int keysize);
+int CMS_add_standard_smimecap(STACK_OF(X509_ALGOR) **smcap);
+
+int CMS_signed_get_attr_count(const CMS_SignerInfo *si);
+int CMS_signed_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos);
+int CMS_signed_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos);
+X509_ATTRIBUTE *CMS_signed_get_attr(const CMS_SignerInfo *si, int loc);
+X509_ATTRIBUTE *CMS_signed_delete_attr(CMS_SignerInfo *si, int loc);
+int CMS_signed_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr);
+int CMS_signed_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len);
+int CMS_signed_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len);
+int CMS_signed_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len);
+void *CMS_signed_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type);
+
+int CMS_unsigned_get_attr_count(const CMS_SignerInfo *si);
+int CMS_unsigned_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos);
+int CMS_unsigned_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos);
+X509_ATTRIBUTE *CMS_unsigned_get_attr(const CMS_SignerInfo *si, int loc);
+X509_ATTRIBUTE *CMS_unsigned_delete_attr(CMS_SignerInfo *si, int loc);
+int CMS_unsigned_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr);
+int CMS_unsigned_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len);
+int CMS_unsigned_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len);
+int CMS_unsigned_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len);
+void *CMS_unsigned_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type);
+
+#ifdef HEADER_X509V3_H
+
+int CMS_get1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest **prr);
+CMS_ReceiptRequest *CMS_ReceiptRequest_create0(unsigned char *id, int idlen,
+ int allorfirst,
+ STACK_OF(GENERAL_NAMES) *receiptList,
+ STACK_OF(GENERAL_NAMES) *receiptsTo);
+int CMS_add1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest *rr);
+void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr,
+ ASN1_STRING **pcid,
+ int *pallorfirst,
+ STACK_OF(GENERAL_NAMES) **plist,
+ STACK_OF(GENERAL_NAMES) **prto);
+
+#endif
+
+/* BEGIN ERROR CODES */
+/* The following lines are auto generated by the script mkerr.pl. Any changes
+ * made after this point may be overwritten when the script is next run.
+ */
+void ERR_load_CMS_strings(void);
+
+/* Error codes for the CMS functions. */
+
+/* Function codes. */
+#define CMS_F_CHECK_CONTENT 99
+#define CMS_F_CMS_ADD0_CERT 164
+#define CMS_F_CMS_ADD0_RECIPIENT_KEY 100
+#define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165
+#define CMS_F_CMS_ADD1_RECEIPTREQUEST 158
+#define CMS_F_CMS_ADD1_RECIPIENT_CERT 101
+#define CMS_F_CMS_ADD1_SIGNER 102
+#define CMS_F_CMS_ADD1_SIGNINGTIME 103
+#define CMS_F_CMS_COMPRESS 104
+#define CMS_F_CMS_COMPRESSEDDATA_CREATE 105
+#define CMS_F_CMS_COMPRESSEDDATA_INIT_BIO 106
+#define CMS_F_CMS_COPY_CONTENT 107
+#define CMS_F_CMS_COPY_MESSAGEDIGEST 108
+#define CMS_F_CMS_DATA 109
+#define CMS_F_CMS_DATAFINAL 110
+#define CMS_F_CMS_DATAINIT 111
+#define CMS_F_CMS_DECRYPT 112
+#define CMS_F_CMS_DECRYPT_SET1_KEY 113
+#define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166
+#define CMS_F_CMS_DECRYPT_SET1_PKEY 114
+#define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115
+#define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116
+#define CMS_F_CMS_DIGESTEDDATA_DO_FINAL 117
+#define CMS_F_CMS_DIGEST_VERIFY 118
+#define CMS_F_CMS_ENCODE_RECEIPT 161
+#define CMS_F_CMS_ENCRYPT 119
+#define CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO 120
+#define CMS_F_CMS_ENCRYPTEDDATA_DECRYPT 121
+#define CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT 122
+#define CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY 123
+#define CMS_F_CMS_ENVELOPEDDATA_CREATE 124
+#define CMS_F_CMS_ENVELOPEDDATA_INIT_BIO 125
+#define CMS_F_CMS_ENVELOPED_DATA_INIT 126
+#define CMS_F_CMS_FINAL 127
+#define CMS_F_CMS_GET0_CERTIFICATE_CHOICES 128
+#define CMS_F_CMS_GET0_CONTENT 129
+#define CMS_F_CMS_GET0_ECONTENT_TYPE 130
+#define CMS_F_CMS_GET0_ENVELOPED 131
+#define CMS_F_CMS_GET0_REVOCATION_CHOICES 132
+#define CMS_F_CMS_GET0_SIGNED 133
+#define CMS_F_CMS_MSGSIGDIGEST_ADD1 162
+#define CMS_F_CMS_RECEIPTREQUEST_CREATE0 159
+#define CMS_F_CMS_RECEIPT_VERIFY 160
+#define CMS_F_CMS_RECIPIENTINFO_DECRYPT 134
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT 135
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT 136
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID 137
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP 138
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP 139
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT 140
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143
+#define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167
+#define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144
+#define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168
+#define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145
+#define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146
+#define CMS_F_CMS_SET_DETACHED 147
+#define CMS_F_CMS_SIGN 148
+#define CMS_F_CMS_SIGNED_DATA_INIT 149
+#define CMS_F_CMS_SIGNERINFO_CONTENT_SIGN 150
+#define CMS_F_CMS_SIGNERINFO_SIGN 151
+#define CMS_F_CMS_SIGNERINFO_VERIFY 152
+#define CMS_F_CMS_SIGNERINFO_VERIFY_CERT 153
+#define CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT 154
+#define CMS_F_CMS_SIGN_RECEIPT 163
+#define CMS_F_CMS_STREAM 155
+#define CMS_F_CMS_UNCOMPRESS 156
+#define CMS_F_CMS_VERIFY 157
+
+/* Reason codes. */
+#define CMS_R_ADD_SIGNER_ERROR 99
+#define CMS_R_CERTIFICATE_ALREADY_PRESENT 175
+#define CMS_R_CERTIFICATE_HAS_NO_KEYID 160
+#define CMS_R_CERTIFICATE_VERIFY_ERROR 100
+#define CMS_R_CIPHER_INITIALISATION_ERROR 101
+#define CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR 102
+#define CMS_R_CMS_DATAFINAL_ERROR 103
+#define CMS_R_CMS_LIB 104
+#define CMS_R_CONTENTIDENTIFIER_MISMATCH 170
+#define CMS_R_CONTENT_NOT_FOUND 105
+#define CMS_R_CONTENT_TYPE_MISMATCH 171
+#define CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA 106
+#define CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA 107
+#define CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA 108
+#define CMS_R_CONTENT_VERIFY_ERROR 109
+#define CMS_R_CTRL_ERROR 110
+#define CMS_R_CTRL_FAILURE 111
+#define CMS_R_DECRYPT_ERROR 112
+#define CMS_R_DIGEST_ERROR 161
+#define CMS_R_ERROR_GETTING_PUBLIC_KEY 113
+#define CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE 114
+#define CMS_R_ERROR_SETTING_KEY 115
+#define CMS_R_ERROR_SETTING_RECIPIENTINFO 116
+#define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117
+#define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176
+#define CMS_R_INVALID_KEY_LENGTH 118
+#define CMS_R_MD_BIO_INIT_ERROR 119
+#define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120
+#define CMS_R_MESSAGEDIGEST_WRONG_LENGTH 121
+#define CMS_R_MSGSIGDIGEST_ERROR 172
+#define CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE 162
+#define CMS_R_MSGSIGDIGEST_WRONG_LENGTH 163
+#define CMS_R_NEED_ONE_SIGNER 164
+#define CMS_R_NOT_A_SIGNED_RECEIPT 165
+#define CMS_R_NOT_ENCRYPTED_DATA 122
+#define CMS_R_NOT_KEK 123
+#define CMS_R_NOT_KEY_TRANSPORT 124
+#define CMS_R_NOT_PWRI 177
+#define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125
+#define CMS_R_NO_CIPHER 126
+#define CMS_R_NO_CONTENT 127
+#define CMS_R_NO_CONTENT_TYPE 173
+#define CMS_R_NO_DEFAULT_DIGEST 128
+#define CMS_R_NO_DIGEST_SET 129
+#define CMS_R_NO_KEY 130
+#define CMS_R_NO_KEY_OR_CERT 174
+#define CMS_R_NO_MATCHING_DIGEST 131
+#define CMS_R_NO_MATCHING_RECIPIENT 132
+#define CMS_R_NO_MATCHING_SIGNATURE 166
+#define CMS_R_NO_MSGSIGDIGEST 167
+#define CMS_R_NO_PASSWORD 178
+#define CMS_R_NO_PRIVATE_KEY 133
+#define CMS_R_NO_PUBLIC_KEY 134
+#define CMS_R_NO_RECEIPT_REQUEST 168
+#define CMS_R_NO_SIGNERS 135
+#define CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE 136
+#define CMS_R_RECEIPT_DECODE_ERROR 169
+#define CMS_R_RECIPIENT_ERROR 137
+#define CMS_R_SIGNER_CERTIFICATE_NOT_FOUND 138
+#define CMS_R_SIGNFINAL_ERROR 139
+#define CMS_R_SMIME_TEXT_ERROR 140
+#define CMS_R_STORE_INIT_ERROR 141
+#define CMS_R_TYPE_NOT_COMPRESSED_DATA 142
+#define CMS_R_TYPE_NOT_DATA 143
+#define CMS_R_TYPE_NOT_DIGESTED_DATA 144
+#define CMS_R_TYPE_NOT_ENCRYPTED_DATA 145
+#define CMS_R_TYPE_NOT_ENVELOPED_DATA 146
+#define CMS_R_UNABLE_TO_FINALIZE_CONTEXT 147
+#define CMS_R_UNKNOWN_CIPHER 148
+#define CMS_R_UNKNOWN_DIGEST_ALGORIHM 149
+#define CMS_R_UNKNOWN_ID 150
+#define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151
+#define CMS_R_UNSUPPORTED_CONTENT_TYPE 152
+#define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153
+#define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179
+#define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154
+#define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155
+#define CMS_R_UNSUPPORTED_TYPE 156
+#define CMS_R_UNWRAP_ERROR 157
+#define CMS_R_UNWRAP_FAILURE 180
+#define CMS_R_VERIFICATION_FAILURE 158
+#define CMS_R_WRAP_ERROR 159
+
+#ifdef __cplusplus
+}
+#endif
+#endif
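
The declarations above follow the PKCS#7 pattern; a sketch of a detached
signing round trip, assuming signer_cert, signer_key, in and out are already
set up by the caller and with error handling trimmed:

    CMS_ContentInfo *cms = CMS_sign(signer_cert, signer_key, NULL, in,
                                    CMS_DETACHED | CMS_STREAM);
    if (cms && SMIME_write_CMS(out, cms, in, CMS_DETACHED | CMS_STREAM))
        {
        /* the verifier later calls SMIME_read_CMS() and then
         * CMS_verify(cms, NULL, store, dcont, outbio, 0); */
        }
    CMS_ContentInfo_free(cms);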
diff --git a/main/openssl/crypto/cms/cms_asn1.c b/main/openssl/crypto/cms/cms_asn1.c
new file mode 100644
index 00000000..cfe67fb6
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_asn1.c
@@ -0,0 +1,389 @@
+/* crypto/cms/cms_asn1.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+
+ASN1_SEQUENCE(CMS_IssuerAndSerialNumber) = {
+ ASN1_SIMPLE(CMS_IssuerAndSerialNumber, issuer, X509_NAME),
+ ASN1_SIMPLE(CMS_IssuerAndSerialNumber, serialNumber, ASN1_INTEGER)
+} ASN1_SEQUENCE_END(CMS_IssuerAndSerialNumber)
+
+ASN1_SEQUENCE(CMS_OtherCertificateFormat) = {
+ ASN1_SIMPLE(CMS_OtherCertificateFormat, otherCertFormat, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherCertificateFormat, otherCert, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherCertificateFormat)
+
+ASN1_CHOICE(CMS_CertificateChoices) = {
+ ASN1_SIMPLE(CMS_CertificateChoices, d.certificate, X509),
+ ASN1_IMP(CMS_CertificateChoices, d.extendedCertificate, ASN1_SEQUENCE, 0),
+ ASN1_IMP(CMS_CertificateChoices, d.v1AttrCert, ASN1_SEQUENCE, 1),
+ ASN1_IMP(CMS_CertificateChoices, d.v2AttrCert, ASN1_SEQUENCE, 2),
+ ASN1_IMP(CMS_CertificateChoices, d.other, CMS_OtherCertificateFormat, 3)
+} ASN1_CHOICE_END(CMS_CertificateChoices)
+
+ASN1_CHOICE(CMS_SignerIdentifier) = {
+ ASN1_SIMPLE(CMS_SignerIdentifier, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_SignerIdentifier, d.subjectKeyIdentifier, ASN1_OCTET_STRING, 0)
+} ASN1_CHOICE_END(CMS_SignerIdentifier)
+
+ASN1_NDEF_SEQUENCE(CMS_EncapsulatedContentInfo) = {
+ ASN1_SIMPLE(CMS_EncapsulatedContentInfo, eContentType, ASN1_OBJECT),
+ ASN1_NDEF_EXP_OPT(CMS_EncapsulatedContentInfo, eContent, ASN1_OCTET_STRING_NDEF, 0)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncapsulatedContentInfo)
+
+/* Minor tweak to operation: free up signer key, cert */
+static int cms_si_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ if(operation == ASN1_OP_FREE_POST)
+ {
+ CMS_SignerInfo *si = (CMS_SignerInfo *)*pval;
+ if (si->pkey)
+ EVP_PKEY_free(si->pkey);
+ if (si->signer)
+ X509_free(si->signer);
+ }
+ return 1;
+ }
+
+ASN1_SEQUENCE_cb(CMS_SignerInfo, cms_si_cb) = {
+ ASN1_SIMPLE(CMS_SignerInfo, version, LONG),
+ ASN1_SIMPLE(CMS_SignerInfo, sid, CMS_SignerIdentifier),
+ ASN1_SIMPLE(CMS_SignerInfo, digestAlgorithm, X509_ALGOR),
+ ASN1_IMP_SET_OF_OPT(CMS_SignerInfo, signedAttrs, X509_ATTRIBUTE, 0),
+ ASN1_SIMPLE(CMS_SignerInfo, signatureAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_SignerInfo, signature, ASN1_OCTET_STRING),
+ ASN1_IMP_SET_OF_OPT(CMS_SignerInfo, unsignedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_SEQUENCE_END_cb(CMS_SignerInfo, CMS_SignerInfo)
+
+ASN1_SEQUENCE(CMS_OtherRevocationInfoFormat) = {
+ ASN1_SIMPLE(CMS_OtherRevocationInfoFormat, otherRevInfoFormat, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherRevocationInfoFormat, otherRevInfo, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherRevocationInfoFormat)
+
+ASN1_CHOICE(CMS_RevocationInfoChoice) = {
+ ASN1_SIMPLE(CMS_RevocationInfoChoice, d.crl, X509_CRL),
+ ASN1_IMP(CMS_RevocationInfoChoice, d.other, CMS_OtherRevocationInfoFormat, 1)
+} ASN1_CHOICE_END(CMS_RevocationInfoChoice)
+
+ASN1_NDEF_SEQUENCE(CMS_SignedData) = {
+ ASN1_SIMPLE(CMS_SignedData, version, LONG),
+ ASN1_SET_OF(CMS_SignedData, digestAlgorithms, X509_ALGOR),
+ ASN1_SIMPLE(CMS_SignedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_SignedData, certificates, CMS_CertificateChoices, 0),
+ ASN1_IMP_SET_OF_OPT(CMS_SignedData, crls, CMS_RevocationInfoChoice, 1),
+ ASN1_SET_OF(CMS_SignedData, signerInfos, CMS_SignerInfo)
+} ASN1_NDEF_SEQUENCE_END(CMS_SignedData)
+
+ASN1_SEQUENCE(CMS_OriginatorInfo) = {
+ ASN1_IMP_SET_OF_OPT(CMS_OriginatorInfo, certificates, CMS_CertificateChoices, 0),
+ ASN1_IMP_SET_OF_OPT(CMS_OriginatorInfo, crls, CMS_RevocationInfoChoice, 1)
+} ASN1_SEQUENCE_END(CMS_OriginatorInfo)
+
+ASN1_NDEF_SEQUENCE(CMS_EncryptedContentInfo) = {
+ ASN1_SIMPLE(CMS_EncryptedContentInfo, contentType, ASN1_OBJECT),
+ ASN1_SIMPLE(CMS_EncryptedContentInfo, contentEncryptionAlgorithm, X509_ALGOR),
+ ASN1_IMP_OPT(CMS_EncryptedContentInfo, encryptedContent, ASN1_OCTET_STRING_NDEF, 0)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncryptedContentInfo)
+
+ASN1_SEQUENCE(CMS_KeyTransRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, version, LONG),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, rid, CMS_SignerIdentifier),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_KeyTransRecipientInfo)
+
+ASN1_SEQUENCE(CMS_OtherKeyAttribute) = {
+ ASN1_SIMPLE(CMS_OtherKeyAttribute, keyAttrId, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherKeyAttribute, keyAttr, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherKeyAttribute)
+
+ASN1_SEQUENCE(CMS_RecipientKeyIdentifier) = {
+ ASN1_SIMPLE(CMS_RecipientKeyIdentifier, subjectKeyIdentifier, ASN1_OCTET_STRING),
+ ASN1_OPT(CMS_RecipientKeyIdentifier, date, ASN1_GENERALIZEDTIME),
+ ASN1_OPT(CMS_RecipientKeyIdentifier, other, CMS_OtherKeyAttribute)
+} ASN1_SEQUENCE_END(CMS_RecipientKeyIdentifier)
+
+ASN1_CHOICE(CMS_KeyAgreeRecipientIdentifier) = {
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientIdentifier, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_KeyAgreeRecipientIdentifier, d.rKeyId, CMS_RecipientKeyIdentifier, 0)
+} ASN1_CHOICE_END(CMS_KeyAgreeRecipientIdentifier)
+
+ASN1_SEQUENCE(CMS_RecipientEncryptedKey) = {
+ ASN1_SIMPLE(CMS_RecipientEncryptedKey, rid, CMS_KeyAgreeRecipientIdentifier),
+ ASN1_SIMPLE(CMS_RecipientEncryptedKey, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_RecipientEncryptedKey)
+
+ASN1_SEQUENCE(CMS_OriginatorPublicKey) = {
+ ASN1_SIMPLE(CMS_OriginatorPublicKey, algorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_OriginatorPublicKey, publicKey, ASN1_BIT_STRING)
+} ASN1_SEQUENCE_END(CMS_OriginatorPublicKey)
+
+ASN1_CHOICE(CMS_OriginatorIdentifierOrKey) = {
+ ASN1_SIMPLE(CMS_OriginatorIdentifierOrKey, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_OriginatorIdentifierOrKey, d.subjectKeyIdentifier, ASN1_OCTET_STRING, 0),
+ ASN1_IMP(CMS_OriginatorIdentifierOrKey, d.originatorKey, CMS_OriginatorPublicKey, 1)
+} ASN1_CHOICE_END(CMS_OriginatorIdentifierOrKey)
+
+ASN1_SEQUENCE(CMS_KeyAgreeRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, version, LONG),
+ ASN1_EXP(CMS_KeyAgreeRecipientInfo, originator, CMS_OriginatorIdentifierOrKey, 0),
+ ASN1_EXP_OPT(CMS_KeyAgreeRecipientInfo, ukm, ASN1_OCTET_STRING, 1),
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SEQUENCE_OF(CMS_KeyAgreeRecipientInfo, recipientEncryptedKeys, CMS_RecipientEncryptedKey)
+} ASN1_SEQUENCE_END(CMS_KeyAgreeRecipientInfo)
+
+ASN1_SEQUENCE(CMS_KEKIdentifier) = {
+ ASN1_SIMPLE(CMS_KEKIdentifier, keyIdentifier, ASN1_OCTET_STRING),
+ ASN1_OPT(CMS_KEKIdentifier, date, ASN1_GENERALIZEDTIME),
+ ASN1_OPT(CMS_KEKIdentifier, other, CMS_OtherKeyAttribute)
+} ASN1_SEQUENCE_END(CMS_KEKIdentifier)
+
+ASN1_SEQUENCE(CMS_KEKRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, version, LONG),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, kekid, CMS_KEKIdentifier),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_KEKRecipientInfo)
+
+ASN1_SEQUENCE(CMS_PasswordRecipientInfo) = {
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, version, LONG),
+ ASN1_IMP_OPT(CMS_PasswordRecipientInfo, keyDerivationAlgorithm, X509_ALGOR, 0),
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_PasswordRecipientInfo)
+
+ASN1_SEQUENCE(CMS_OtherRecipientInfo) = {
+ ASN1_SIMPLE(CMS_OtherRecipientInfo, oriType, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherRecipientInfo, oriValue, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherRecipientInfo)
+
+/* Free up RecipientInfo additional data */
+static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ if(operation == ASN1_OP_FREE_PRE)
+ {
+ CMS_RecipientInfo *ri = (CMS_RecipientInfo *)*pval;
+ if (ri->type == CMS_RECIPINFO_TRANS)
+ {
+ CMS_KeyTransRecipientInfo *ktri = ri->d.ktri;
+ if (ktri->pkey)
+ EVP_PKEY_free(ktri->pkey);
+ if (ktri->recip)
+ X509_free(ktri->recip);
+ }
+ else if (ri->type == CMS_RECIPINFO_KEK)
+ {
+ CMS_KEKRecipientInfo *kekri = ri->d.kekri;
+ if (kekri->key)
+ {
+ OPENSSL_cleanse(kekri->key, kekri->keylen);
+ OPENSSL_free(kekri->key);
+ }
+ }
+ else if (ri->type == CMS_RECIPINFO_PASS)
+ {
+ CMS_PasswordRecipientInfo *pwri = ri->d.pwri;
+ if (pwri->pass)
+ {
+ OPENSSL_cleanse(pwri->pass, pwri->passlen);
+ OPENSSL_free(pwri->pass);
+ }
+ }
+ }
+ return 1;
+ }
+
+ASN1_CHOICE_cb(CMS_RecipientInfo, cms_ri_cb) = {
+ ASN1_SIMPLE(CMS_RecipientInfo, d.ktri, CMS_KeyTransRecipientInfo),
+ ASN1_IMP(CMS_RecipientInfo, d.kari, CMS_KeyAgreeRecipientInfo, 1),
+ ASN1_IMP(CMS_RecipientInfo, d.kekri, CMS_KEKRecipientInfo, 2),
+ ASN1_IMP(CMS_RecipientInfo, d.pwri, CMS_PasswordRecipientInfo, 3),
+ ASN1_IMP(CMS_RecipientInfo, d.ori, CMS_OtherRecipientInfo, 4)
+} ASN1_CHOICE_END_cb(CMS_RecipientInfo, CMS_RecipientInfo, type)
+
+ASN1_NDEF_SEQUENCE(CMS_EnvelopedData) = {
+ ASN1_SIMPLE(CMS_EnvelopedData, version, LONG),
+ ASN1_IMP_OPT(CMS_EnvelopedData, originatorInfo, CMS_OriginatorInfo, 0),
+ ASN1_SET_OF(CMS_EnvelopedData, recipientInfos, CMS_RecipientInfo),
+ ASN1_SIMPLE(CMS_EnvelopedData, encryptedContentInfo, CMS_EncryptedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_EnvelopedData, unprotectedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_NDEF_SEQUENCE_END(CMS_EnvelopedData)
+
+ASN1_NDEF_SEQUENCE(CMS_DigestedData) = {
+ ASN1_SIMPLE(CMS_DigestedData, version, LONG),
+ ASN1_SIMPLE(CMS_DigestedData, digestAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_DigestedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_SIMPLE(CMS_DigestedData, digest, ASN1_OCTET_STRING)
+} ASN1_NDEF_SEQUENCE_END(CMS_DigestedData)
+
+ASN1_NDEF_SEQUENCE(CMS_EncryptedData) = {
+ ASN1_SIMPLE(CMS_EncryptedData, version, LONG),
+ ASN1_SIMPLE(CMS_EncryptedData, encryptedContentInfo, CMS_EncryptedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_EncryptedData, unprotectedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncryptedData)
+
+ASN1_NDEF_SEQUENCE(CMS_AuthenticatedData) = {
+ ASN1_SIMPLE(CMS_AuthenticatedData, version, LONG),
+ ASN1_IMP_OPT(CMS_AuthenticatedData, originatorInfo, CMS_OriginatorInfo, 0),
+ ASN1_SET_OF(CMS_AuthenticatedData, recipientInfos, CMS_RecipientInfo),
+ ASN1_SIMPLE(CMS_AuthenticatedData, macAlgorithm, X509_ALGOR),
+ ASN1_IMP(CMS_AuthenticatedData, digestAlgorithm, X509_ALGOR, 1),
+ ASN1_SIMPLE(CMS_AuthenticatedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_AuthenticatedData, authAttrs, X509_ALGOR, 2),
+ ASN1_SIMPLE(CMS_AuthenticatedData, mac, ASN1_OCTET_STRING),
+ ASN1_IMP_SET_OF_OPT(CMS_AuthenticatedData, unauthAttrs, X509_ALGOR, 3)
+} ASN1_NDEF_SEQUENCE_END(CMS_AuthenticatedData)
+
+ASN1_NDEF_SEQUENCE(CMS_CompressedData) = {
+ ASN1_SIMPLE(CMS_CompressedData, version, LONG),
+ ASN1_SIMPLE(CMS_CompressedData, compressionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_CompressedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+} ASN1_NDEF_SEQUENCE_END(CMS_CompressedData)
+
+/* This is the ANY DEFINED BY table for the top level ContentInfo structure */
+
+ASN1_ADB_TEMPLATE(cms_default) = ASN1_EXP(CMS_ContentInfo, d.other, ASN1_ANY, 0);
+
+ASN1_ADB(CMS_ContentInfo) = {
+ ADB_ENTRY(NID_pkcs7_data, ASN1_NDEF_EXP(CMS_ContentInfo, d.data, ASN1_OCTET_STRING_NDEF, 0)),
+ ADB_ENTRY(NID_pkcs7_signed, ASN1_NDEF_EXP(CMS_ContentInfo, d.signedData, CMS_SignedData, 0)),
+ ADB_ENTRY(NID_pkcs7_enveloped, ASN1_NDEF_EXP(CMS_ContentInfo, d.envelopedData, CMS_EnvelopedData, 0)),
+ ADB_ENTRY(NID_pkcs7_digest, ASN1_NDEF_EXP(CMS_ContentInfo, d.digestedData, CMS_DigestedData, 0)),
+ ADB_ENTRY(NID_pkcs7_encrypted, ASN1_NDEF_EXP(CMS_ContentInfo, d.encryptedData, CMS_EncryptedData, 0)),
+ ADB_ENTRY(NID_id_smime_ct_authData, ASN1_NDEF_EXP(CMS_ContentInfo, d.authenticatedData, CMS_AuthenticatedData, 0)),
+ ADB_ENTRY(NID_id_smime_ct_compressedData, ASN1_NDEF_EXP(CMS_ContentInfo, d.compressedData, CMS_CompressedData, 0)),
+} ASN1_ADB_END(CMS_ContentInfo, 0, contentType, 0, &cms_default_tt, NULL);
+
+/* CMS streaming support */
+static int cms_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ ASN1_STREAM_ARG *sarg = exarg;
+ CMS_ContentInfo *cms = NULL;
+ if (pval)
+ cms = (CMS_ContentInfo *)*pval;
+ else
+ return 1;
+ switch(operation)
+ {
+
+ case ASN1_OP_STREAM_PRE:
+ if (CMS_stream(&sarg->boundary, cms) <= 0)
+ return 0;
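+		/* Fall through: streaming also needs the content BIO
+		 * set up below.
+		 */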
+ case ASN1_OP_DETACHED_PRE:
+ sarg->ndef_bio = CMS_dataInit(cms, sarg->out);
+ if (!sarg->ndef_bio)
+ return 0;
+ break;
+
+ case ASN1_OP_STREAM_POST:
+ case ASN1_OP_DETACHED_POST:
+ if (CMS_dataFinal(cms, sarg->ndef_bio) <= 0)
+ return 0;
+ break;
+
+ }
+ return 1;
+ }
+
+ASN1_NDEF_SEQUENCE_cb(CMS_ContentInfo, cms_cb) = {
+ ASN1_SIMPLE(CMS_ContentInfo, contentType, ASN1_OBJECT),
+ ASN1_ADB_OBJECT(CMS_ContentInfo)
+} ASN1_NDEF_SEQUENCE_END_cb(CMS_ContentInfo, CMS_ContentInfo)
+
+/* Specials for signed attributes */
+
+/* When signing attributes we want to reorder them to match the sorted
+ * encoding.
+ */
+
+ASN1_ITEM_TEMPLATE(CMS_Attributes_Sign) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SET_ORDER, 0, CMS_ATTRIBUTES, X509_ATTRIBUTE)
+ASN1_ITEM_TEMPLATE_END(CMS_Attributes_Sign)
+
+/* When verifying attributes we need to use the received order, so
+ * we use SEQUENCE OF and tag it as SET OF.
+ */
+
+ASN1_ITEM_TEMPLATE(CMS_Attributes_Verify) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF | ASN1_TFLG_IMPTAG | ASN1_TFLG_UNIVERSAL,
+ V_ASN1_SET, CMS_ATTRIBUTES, X509_ATTRIBUTE)
+ASN1_ITEM_TEMPLATE_END(CMS_Attributes_Verify)
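+
+/* Editor's sketch, not part of the imported sources: these two
+ * templates are what the signing and verification code (cms_sd.c)
+ * passes to ASN1_item_i2d() when encoding signedAttrs.  Signing must
+ * emit the canonical sorted SET OF; verification must reproduce the
+ * attribute bytes exactly as received.  The helper below is
+ * illustrative only and error handling is elided.
+ */
+static int sketch_encode_signed_attrs(CMS_SignerInfo *si, int verify,
+					unsigned char **pbuf)
+	{
+	const ASN1_ITEM *it = verify ? ASN1_ITEM_rptr(CMS_Attributes_Verify)
+				     : ASN1_ITEM_rptr(CMS_Attributes_Sign);
+	/* i2d of the attribute stack via the selected template */
+	return ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs, pbuf, it);
+	}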
+
+
+
+ASN1_CHOICE(CMS_ReceiptsFrom) = {
+ ASN1_IMP(CMS_ReceiptsFrom, d.allOrFirstTier, LONG, 0),
+ ASN1_IMP_SEQUENCE_OF(CMS_ReceiptsFrom, d.receiptList, GENERAL_NAMES, 1)
+} ASN1_CHOICE_END(CMS_ReceiptsFrom)
+
+ASN1_SEQUENCE(CMS_ReceiptRequest) = {
+ ASN1_SIMPLE(CMS_ReceiptRequest, signedContentIdentifier, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(CMS_ReceiptRequest, receiptsFrom, CMS_ReceiptsFrom),
+ ASN1_SEQUENCE_OF(CMS_ReceiptRequest, receiptsTo, GENERAL_NAMES)
+} ASN1_SEQUENCE_END(CMS_ReceiptRequest)
+
+ASN1_SEQUENCE(CMS_Receipt) = {
+ ASN1_SIMPLE(CMS_Receipt, version, LONG),
+ ASN1_SIMPLE(CMS_Receipt, contentType, ASN1_OBJECT),
+ ASN1_SIMPLE(CMS_Receipt, signedContentIdentifier, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(CMS_Receipt, originatorSignatureValue, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_Receipt)
+
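+/* Editor's sketch, not part of the imported sources: the cms_cb()
+ * streaming callback above is what lets one-pass writers such as
+ * i2d_CMS_bio_stream() work.  "in" and "out" are assumed caller
+ * BIOs and "cms" a structure created with the CMS_STREAM flag;
+ * error handling is elided.
+ */
+static int sketch_stream_out(BIO *out, CMS_ContentInfo *cms, BIO *in)
+	{
+	/* Triggers ASN1_OP_STREAM_PRE/POST through cms_cb() */
+	return i2d_CMS_bio_stream(out, cms, in, CMS_STREAM);
+	}
+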
diff --git a/main/openssl/crypto/cms/cms_att.c b/main/openssl/crypto/cms/cms_att.c
new file mode 100644
index 00000000..5b71722e
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_att.c
@@ -0,0 +1,195 @@
+/* crypto/cms/cms_att.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+/* CMS SignedData Attribute utilities */
+
+int CMS_signed_get_attr_count(const CMS_SignerInfo *si)
+{
+ return X509at_get_attr_count(si->signedAttrs);
+}
+
+int CMS_signed_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos)
+{
+ return X509at_get_attr_by_NID(si->signedAttrs, nid, lastpos);
+}
+
+int CMS_signed_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos)
+{
+ return X509at_get_attr_by_OBJ(si->signedAttrs, obj, lastpos);
+}
+
+X509_ATTRIBUTE *CMS_signed_get_attr(const CMS_SignerInfo *si, int loc)
+{
+ return X509at_get_attr(si->signedAttrs, loc);
+}
+
+X509_ATTRIBUTE *CMS_signed_delete_attr(CMS_SignerInfo *si, int loc)
+{
+ return X509at_delete_attr(si->signedAttrs, loc);
+}
+
+int CMS_signed_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr)
+{
+ if(X509at_add1_attr(&si->signedAttrs, attr)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_OBJ(&si->signedAttrs, obj,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_NID(&si->signedAttrs, nid,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_txt(&si->signedAttrs, attrname,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+void *CMS_signed_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type)
+{
+ return X509at_get0_data_by_OBJ(si->signedAttrs, oid, lastpos, type);
+}
+
+int CMS_unsigned_get_attr_count(const CMS_SignerInfo *si)
+{
+ return X509at_get_attr_count(si->unsignedAttrs);
+}
+
+int CMS_unsigned_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos)
+{
+ return X509at_get_attr_by_NID(si->unsignedAttrs, nid, lastpos);
+}
+
+int CMS_unsigned_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos)
+{
+ return X509at_get_attr_by_OBJ(si->unsignedAttrs, obj, lastpos);
+}
+
+X509_ATTRIBUTE *CMS_unsigned_get_attr(const CMS_SignerInfo *si, int loc)
+{
+ return X509at_get_attr(si->unsignedAttrs, loc);
+}
+
+X509_ATTRIBUTE *CMS_unsigned_delete_attr(CMS_SignerInfo *si, int loc)
+{
+ return X509at_delete_attr(si->unsignedAttrs, loc);
+}
+
+int CMS_unsigned_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr)
+{
+ if(X509at_add1_attr(&si->unsignedAttrs, attr)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_OBJ(&si->unsignedAttrs, obj,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_NID(&si->unsignedAttrs, nid,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_txt(&si->unsignedAttrs, attrname,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+void *CMS_unsigned_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type)
+{
+ return X509at_get0_data_by_OBJ(si->unsignedAttrs, oid, lastpos, type);
+}
+
+/* Specific attribute cases */
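+
+/* Editor's sketch, not part of the imported sources: typical use of
+ * the wrappers above, mirroring how a signing-time attribute is
+ * attached; "si" is an assumed CMS_SignerInfo and error handling is
+ * elided.
+ */
+static int sketch_add_signing_time(CMS_SignerInfo *si)
+	{
+	ASN1_TIME *t = X509_gmtime_adj(NULL, 0);
+	int r = 0;
+	if (t)
+		{
+		r = CMS_signed_add1_attr_by_NID(si, NID_pkcs9_signingTime,
+						t->type, t, -1);
+		ASN1_TIME_free(t);
+		}
+	return r;
+	}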
diff --git a/main/openssl/crypto/cms/cms_cd.c b/main/openssl/crypto/cms/cms_cd.c
new file mode 100644
index 00000000..20216881
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_cd.c
@@ -0,0 +1,136 @@
+/* crypto/cms/cms_cd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
+#include <openssl/comp.h>
+#endif
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_CompressedData)
+
+#ifdef ZLIB
+
+/* CMS CompressedData Utilities */
+
+CMS_ContentInfo *cms_CompressedData_create(int comp_nid)
+ {
+ CMS_ContentInfo *cms;
+ CMS_CompressedData *cd;
+	/* This will need something cleverer if there is ever more than
+	 * one compression algorithm, or if the parameters acquire some
+	 * meaning...
+	 */
+ if (comp_nid != NID_zlib_compression)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_CREATE,
+ CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+
+ cd = M_ASN1_new_of(CMS_CompressedData);
+
+ if (!cd)
+ goto err;
+
+ cms->contentType = OBJ_nid2obj(NID_id_smime_ct_compressedData);
+ cms->d.compressedData = cd;
+
+ cd->version = 0;
+
+ X509_ALGOR_set0(cd->compressionAlgorithm,
+ OBJ_nid2obj(NID_zlib_compression),
+ V_ASN1_UNDEF, NULL);
+
+ cd->encapContentInfo->eContentType = OBJ_nid2obj(NID_pkcs7_data);
+
+ return cms;
+
+ err:
+
+ if (cms)
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+BIO *cms_CompressedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_CompressedData *cd;
+ ASN1_OBJECT *compoid;
+ if (OBJ_obj2nid(cms->contentType) != NID_id_smime_ct_compressedData)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO,
+ CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA);
+ return NULL;
+ }
+ cd = cms->d.compressedData;
+ X509_ALGOR_get0(&compoid, NULL, NULL, cd->compressionAlgorithm);
+ if (OBJ_obj2nid(compoid) != NID_zlib_compression)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO,
+ CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+ return BIO_new(BIO_f_zlib());
+ }
+
+#endif
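+
+/* Editor's sketch, not part of the imported sources: the public
+ * entry point built on the helpers above (it needs a ZLIB-enabled
+ * build to succeed); "in" is an assumed data BIO and error handling
+ * is elided.
+ */
+static CMS_ContentInfo *sketch_compress(BIO *in)
+	{
+	/* NID_zlib_compression is the only algorithm accepted above */
+	return CMS_compress(in, NID_zlib_compression, CMS_BINARY);
+	}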
diff --git a/main/openssl/crypto/cms/cms_dd.c b/main/openssl/crypto/cms/cms_dd.c
new file mode 100644
index 00000000..8919c15b
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_dd.c
@@ -0,0 +1,148 @@
+/* crypto/cms/cms_dd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_DigestedData)
+
+/* CMS DigestedData Utilities */
+
+CMS_ContentInfo *cms_DigestedData_create(const EVP_MD *md)
+ {
+ CMS_ContentInfo *cms;
+ CMS_DigestedData *dd;
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+
+ dd = M_ASN1_new_of(CMS_DigestedData);
+
+ if (!dd)
+ goto err;
+
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_digest);
+ cms->d.digestedData = dd;
+
+ dd->version = 0;
+ dd->encapContentInfo->eContentType = OBJ_nid2obj(NID_pkcs7_data);
+
+ cms_DigestAlgorithm_set(dd->digestAlgorithm, md);
+
+ return cms;
+
+ err:
+
+ if (cms)
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+BIO *cms_DigestedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_DigestedData *dd;
+ dd = cms->d.digestedData;
+ return cms_DigestAlgorithm_init_bio(dd->digestAlgorithm);
+ }
+
+int cms_DigestedData_do_final(CMS_ContentInfo *cms, BIO *chain, int verify)
+ {
+ EVP_MD_CTX mctx;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ unsigned int mdlen;
+ int r = 0;
+ CMS_DigestedData *dd;
+ EVP_MD_CTX_init(&mctx);
+
+ dd = cms->d.digestedData;
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, dd->digestAlgorithm))
+ goto err;
+
+ if (EVP_DigestFinal_ex(&mctx, md, &mdlen) <= 0)
+ goto err;
+
+ if (verify)
+ {
+ if (mdlen != (unsigned int)dd->digest->length)
+ {
+ CMSerr(CMS_F_CMS_DIGESTEDDATA_DO_FINAL,
+ CMS_R_MESSAGEDIGEST_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(md, dd->digest->data, mdlen))
+ CMSerr(CMS_F_CMS_DIGESTEDDATA_DO_FINAL,
+ CMS_R_VERIFICATION_FAILURE);
+ else
+ r = 1;
+ }
+ else
+ {
+ if (!ASN1_STRING_set(dd->digest, md, mdlen))
+ goto err;
+ r = 1;
+ }
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+
+ return r;
+
+ }
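+
+/* Editor's sketch, not part of the imported sources: a DigestedData
+ * round trip through the public API that drives the helpers above;
+ * "in"/"out" are assumed BIOs and error handling is elided.
+ */
+static int sketch_digest_roundtrip(BIO *in, BIO *out)
+	{
+	CMS_ContentInfo *cms = CMS_digest_create(in, EVP_sha256(),
+							CMS_BINARY);
+	int r = cms && CMS_digest_verify(cms, NULL, out, 0);
+	CMS_ContentInfo_free(cms);
+	return r;
+	}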
diff --git a/main/openssl/crypto/cms/cms_enc.c b/main/openssl/crypto/cms/cms_enc.c
new file mode 100644
index 00000000..bebeaf29
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_enc.c
@@ -0,0 +1,294 @@
+/* crypto/cms/cms_enc.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include "cms_lcl.h"
+
+/* CMS EncryptedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_EncryptedData)
+
+/* Return BIO based on EncryptedContentInfo and key */
+
+BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec)
+ {
+ BIO *b;
+ EVP_CIPHER_CTX *ctx;
+ const EVP_CIPHER *ciph;
+ X509_ALGOR *calg = ec->contentEncryptionAlgorithm;
+ unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL;
+ unsigned char *tkey = NULL;
+ size_t tkeylen = 0;
+
+ int ok = 0;
+
+ int enc, keep_key = 0;
+
+ enc = ec->cipher ? 1 : 0;
+
+ b = BIO_new(BIO_f_cipher());
+ if (!b)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ BIO_get_cipher_ctx(b, &ctx);
+
+ if (enc)
+ {
+ ciph = ec->cipher;
+		/* Once a key is present, clear the cipher so that
+		 * subsequent calls on this structure decrypt rather
+		 * than encrypt.
+		 */
+ if (ec->key)
+ ec->cipher = NULL;
+ }
+ else
+ {
+ ciph = EVP_get_cipherbyobj(calg->algorithm);
+
+ if (!ciph)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_UNKNOWN_CIPHER);
+ goto err;
+ }
+ }
+
+ if (EVP_CipherInit_ex(ctx, ciph, NULL, NULL, NULL, enc) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ if (enc)
+ {
+ int ivlen;
+ calg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(ctx));
+ /* Generate a random IV if we need one */
+ ivlen = EVP_CIPHER_CTX_iv_length(ctx);
+ if (ivlen > 0)
+ {
+ if (RAND_pseudo_bytes(iv, ivlen) <= 0)
+ goto err;
+ piv = iv;
+ }
+ }
+ else if (EVP_CIPHER_asn1_to_param(ctx, calg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ tkeylen = EVP_CIPHER_CTX_key_length(ctx);
+ /* Generate random session key */
+ if (!enc || !ec->key)
+ {
+ tkey = OPENSSL_malloc(tkeylen);
+ if (!tkey)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0)
+ goto err;
+ }
+
+ if (!ec->key)
+ {
+ ec->key = tkey;
+ ec->keylen = tkeylen;
+ tkey = NULL;
+ if (enc)
+ keep_key = 1;
+ else
+ ERR_clear_error();
+
+ }
+
+ if (ec->keylen != tkeylen)
+ {
+ /* If necessary set key length */
+ if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0)
+ {
+			/* Only reveal the failure when debugging, so we
+			 * don't leak information that could help an MMA
+			 * (Bleichenbacher-style million message attack).
+			 */
+ if (enc || ec->debug)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+ else
+ {
+ /* Use random key */
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = tkey;
+ ec->keylen = tkeylen;
+ tkey = NULL;
+ ERR_clear_error();
+ }
+ }
+ }
+
+ if (EVP_CipherInit_ex(ctx, NULL, NULL, ec->key, piv, enc) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ if (piv)
+ {
+ calg->parameter = ASN1_TYPE_new();
+ if (!calg->parameter)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_param_to_asn1(ctx, calg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ }
+ ok = 1;
+
+ err:
+ if (ec->key && !keep_key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = NULL;
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey, tkeylen);
+ OPENSSL_free(tkey);
+ }
+ if (ok)
+ return b;
+ BIO_free(b);
+ return NULL;
+ }
+
+int cms_EncryptedContent_init(CMS_EncryptedContentInfo *ec,
+ const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen)
+ {
+ ec->cipher = cipher;
+ if (key)
+ {
+ ec->key = OPENSSL_malloc(keylen);
+ if (!ec->key)
+ return 0;
+ memcpy(ec->key, key, keylen);
+ }
+ ec->keylen = keylen;
+ if (cipher)
+ ec->contentType = OBJ_nid2obj(NID_pkcs7_data);
+ return 1;
+ }
+
+int CMS_EncryptedData_set1_key(CMS_ContentInfo *cms, const EVP_CIPHER *ciph,
+ const unsigned char *key, size_t keylen)
+ {
+ CMS_EncryptedContentInfo *ec;
+ if (!key || !keylen)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY, CMS_R_NO_KEY);
+ return 0;
+ }
+ if (ciph)
+ {
+ cms->d.encryptedData = M_ASN1_new_of(CMS_EncryptedData);
+ if (!cms->d.encryptedData)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY,
+ ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_encrypted);
+ cms->d.encryptedData->version = 0;
+ }
+ else if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_encrypted)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY,
+ CMS_R_NOT_ENCRYPTED_DATA);
+ return 0;
+ }
+ ec = cms->d.encryptedData->encryptedContentInfo;
+ return cms_EncryptedContent_init(ec, ciph, key, keylen);
+ }
+
+BIO *cms_EncryptedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_EncryptedData *enc = cms->d.encryptedData;
+ if (enc->encryptedContentInfo->cipher && enc->unprotectedAttrs)
+ enc->version = 2;
+ return cms_EncryptedContent_init_bio(enc->encryptedContentInfo);
+ }
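+
+/* Editor's sketch, not part of the imported sources: a pre-shared key
+ * EncryptedData round trip through the public API built on the
+ * helpers above.  "key" must match the cipher (16 bytes here for
+ * AES-128-CBC); the BIOs are assumptions and error handling is
+ * elided.
+ */
+static int sketch_encrypteddata(BIO *in, BIO *out,
+				const unsigned char *key, size_t keylen)
+	{
+	CMS_ContentInfo *cms = CMS_EncryptedData_encrypt(in,
+				EVP_aes_128_cbc(), key, keylen, CMS_BINARY);
+	int r = cms && CMS_EncryptedData_decrypt(cms, key, keylen,
+							NULL, out, 0);
+	CMS_ContentInfo_free(cms);
+	return r;
+	}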
diff --git a/main/openssl/crypto/cms/cms_env.c b/main/openssl/crypto/cms/cms_env.c
new file mode 100644
index 00000000..be20b1c0
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_env.c
@@ -0,0 +1,876 @@
+/* crypto/cms/cms_env.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+/* CMS EnvelopedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_EnvelopedData)
+DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo)
+DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo)
+DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute)
+
+DECLARE_STACK_OF(CMS_RecipientInfo)
+
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms)
+ {
+ if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped)
+ {
+ CMSerr(CMS_F_CMS_GET0_ENVELOPED,
+ CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA);
+ return NULL;
+ }
+ return cms->d.envelopedData;
+ }
+
+static CMS_EnvelopedData *cms_enveloped_data_init(CMS_ContentInfo *cms)
+ {
+ if (cms->d.other == NULL)
+ {
+ cms->d.envelopedData = M_ASN1_new_of(CMS_EnvelopedData);
+ if (!cms->d.envelopedData)
+ {
+ CMSerr(CMS_F_CMS_ENVELOPED_DATA_INIT,
+ ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ cms->d.envelopedData->version = 0;
+ cms->d.envelopedData->encryptedContentInfo->contentType =
+ OBJ_nid2obj(NID_pkcs7_data);
+ ASN1_OBJECT_free(cms->contentType);
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_enveloped);
+ return cms->d.envelopedData;
+ }
+ return cms_get0_enveloped(cms);
+ }
+
+STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms)
+ {
+ CMS_EnvelopedData *env;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ return NULL;
+ return env->recipientInfos;
+ }
+
+int CMS_RecipientInfo_type(CMS_RecipientInfo *ri)
+ {
+ return ri->type;
+ }
+
+CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher)
+ {
+ CMS_ContentInfo *cms;
+ CMS_EnvelopedData *env;
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ goto merr;
+ env = cms_enveloped_data_init(cms);
+ if (!env)
+ goto merr;
+ if (!cms_EncryptedContent_init(env->encryptedContentInfo,
+ cipher, NULL, 0))
+ goto merr;
+ return cms;
+ merr:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_CREATE, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+/* Key Transport Recipient Info (KTRI) routines */
+
+/* Add a recipient certificate. For now only key transport is handled;
+ * if key agreement is ever supported this will need updating.
+ */
+
+CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
+ X509 *recip, unsigned int flags)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_EnvelopedData *env;
+ EVP_PKEY *pk = NULL;
+ int i, type;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ /* Initialize and add key transport recipient info */
+
+ ri->d.ktri = M_ASN1_new_of(CMS_KeyTransRecipientInfo);
+ if (!ri->d.ktri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_TRANS;
+
+ ktri = ri->d.ktri;
+
+ X509_check_purpose(recip, -1, -1);
+ pk = X509_get_pubkey(recip);
+ if (!pk)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_ERROR_GETTING_PUBLIC_KEY);
+ goto err;
+ }
+ CRYPTO_add(&recip->references, 1, CRYPTO_LOCK_X509);
+ ktri->pkey = pk;
+ ktri->recip = recip;
+
+ if (flags & CMS_USE_KEYID)
+ {
+ ktri->version = 2;
+ type = CMS_RECIPINFO_KEYIDENTIFIER;
+ }
+ else
+ {
+ ktri->version = 0;
+ type = CMS_RECIPINFO_ISSUER_SERIAL;
+ }
+
+ /* Not a typo: RecipientIdentifier and SignerIdentifier are the
+ * same structure.
+ */
+
+ if (!cms_set1_SignerIdentifier(ktri->rid, recip, type))
+ goto err;
+
+ if (pk->ameth && pk->ameth->pkey_ctrl)
+ {
+ i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_ENVELOPE,
+ 0, ri);
+ if (i == -2)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ goto err;
+ }
+ if (i <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_CTRL_FAILURE);
+ goto err;
+ }
+ }
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT, ERR_R_MALLOC_FAILURE);
+ err:
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ return NULL;
+
+ }
+
+int CMS_RecipientInfo_ktri_get0_algs(CMS_RecipientInfo *ri,
+ EVP_PKEY **pk, X509 **recip,
+ X509_ALGOR **palg)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+
+ ktri = ri->d.ktri;
+
+ if (pk)
+ *pk = ktri->pkey;
+ if (recip)
+ *recip = ktri->recip;
+ if (palg)
+ *palg = ktri->keyEncryptionAlgorithm;
+ return 1;
+ }
+
+int CMS_RecipientInfo_ktri_get0_signer_id(CMS_RecipientInfo *ri,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ktri = ri->d.ktri;
+
+ return cms_SignerIdentifier_get0_signer_id(ktri->rid,
+ keyid, issuer, sno);
+ }
+
+int CMS_RecipientInfo_ktri_cert_cmp(CMS_RecipientInfo *ri, X509 *cert)
+ {
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return -2;
+ }
+ return cms_SignerIdentifier_cert_cmp(ri->d.ktri->rid, cert);
+ }
+
+int CMS_RecipientInfo_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pkey)
+ {
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ri->d.ktri->pkey = pkey;
+ return 1;
+ }
+
+/* Encrypt content key in key transport recipient info */
+
+static int cms_RecipientInfo_ktri_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_EncryptedContentInfo *ec;
+ EVP_PKEY_CTX *pctx = NULL;
+ unsigned char *ek = NULL;
+ size_t eklen;
+
+ int ret = 0;
+
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ktri = ri->d.ktri;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!pctx)
+ return 0;
+
+ if (EVP_PKEY_encrypt_init(pctx) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_ENCRYPT,
+ EVP_PKEY_CTRL_CMS_ENCRYPT, 0, ri) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
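+	/* First call with a NULL buffer obtains the required output
+	 * length; the second call below does the actual encryption.
+	 */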
+ if (EVP_PKEY_encrypt(pctx, NULL, &eklen, ec->key, ec->keylen) <= 0)
+ goto err;
+
+ ek = OPENSSL_malloc(eklen);
+
+ if (ek == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (EVP_PKEY_encrypt(pctx, ek, &eklen, ec->key, ec->keylen) <= 0)
+ goto err;
+
+ ASN1_STRING_set0(ktri->encryptedKey, ek, eklen);
+ ek = NULL;
+
+ ret = 1;
+
+ err:
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
+ if (ek)
+ OPENSSL_free(ek);
+ return ret;
+
+ }
+
+/* Decrypt content key from KTRI */
+
+static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_KeyTransRecipientInfo *ktri = ri->d.ktri;
+ EVP_PKEY_CTX *pctx = NULL;
+ unsigned char *ek = NULL;
+ size_t eklen;
+ int ret = 0;
+ CMS_EncryptedContentInfo *ec;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ if (ktri->pkey == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT,
+ CMS_R_NO_PRIVATE_KEY);
+ return 0;
+ }
+
+ pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!pctx)
+ return 0;
+
+ if (EVP_PKEY_decrypt_init(pctx) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_DECRYPT,
+ EVP_PKEY_CTRL_CMS_DECRYPT, 0, ri) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ if (EVP_PKEY_decrypt(pctx, NULL, &eklen,
+ ktri->encryptedKey->data,
+ ktri->encryptedKey->length) <= 0)
+ goto err;
+
+ ek = OPENSSL_malloc(eklen);
+
+ if (ek == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (EVP_PKEY_decrypt(pctx, ek, &eklen,
+ ktri->encryptedKey->data,
+ ktri->encryptedKey->length) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CMS_LIB);
+ goto err;
+ }
+
+ ret = 1;
+
+ if (ec->key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ }
+
+ ec->key = ek;
+ ec->keylen = eklen;
+
+ err:
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
+ if (!ret && ek)
+ OPENSSL_free(ek);
+
+ return ret;
+ }
+
+/* Key Encrypted Key (KEK) RecipientInfo routines */
+
+int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri,
+ const unsigned char *id, size_t idlen)
+ {
+ ASN1_OCTET_STRING tmp_os;
+ CMS_KEKRecipientInfo *kekri;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP, CMS_R_NOT_KEK);
+ return -2;
+ }
+ kekri = ri->d.kekri;
+ tmp_os.type = V_ASN1_OCTET_STRING;
+ tmp_os.flags = 0;
+ tmp_os.data = (unsigned char *)id;
+ tmp_os.length = (int)idlen;
+ return ASN1_OCTET_STRING_cmp(&tmp_os, kekri->kekid->keyIdentifier);
+ }
+
+/* For now, hard-code the AES key wrap information */
+
+static size_t aes_wrap_keylen(int nid)
+ {
+ switch (nid)
+ {
+ case NID_id_aes128_wrap:
+ return 16;
+
+ case NID_id_aes192_wrap:
+ return 24;
+
+ case NID_id_aes256_wrap:
+ return 32;
+
+ default:
+ return 0;
+ }
+ }
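+
+/* Editor's note: AES key wrap (RFC 3394) adds an 8-byte integrity
+ * block, so a wrapped key is always the key length plus 8 bytes; the
+ * kekri encrypt/decrypt routines below size their buffers accordingly.
+ */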
+
+CMS_RecipientInfo *CMS_add0_recipient_key(CMS_ContentInfo *cms, int nid,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen,
+ ASN1_GENERALIZEDTIME *date,
+ ASN1_OBJECT *otherTypeId,
+ ASN1_TYPE *otherType)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_EnvelopedData *env;
+ CMS_KEKRecipientInfo *kekri;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ if (nid == NID_undef)
+ {
+ switch (keylen)
+ {
+ case 16:
+ nid = NID_id_aes128_wrap;
+ break;
+
+ case 24:
+ nid = NID_id_aes192_wrap;
+ break;
+
+ case 32:
+ nid = NID_id_aes256_wrap;
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+
+ }
+ else
+ {
+
+ size_t exp_keylen = aes_wrap_keylen(nid);
+
+ if (!exp_keylen)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_UNSUPPORTED_KEK_ALGORITHM);
+ goto err;
+ }
+
+ if (keylen != exp_keylen)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+
+ }
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ ri->d.kekri = M_ASN1_new_of(CMS_KEKRecipientInfo);
+ if (!ri->d.kekri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_KEK;
+
+ kekri = ri->d.kekri;
+
+ if (otherTypeId)
+ {
+ kekri->kekid->other = M_ASN1_new_of(CMS_OtherKeyAttribute);
+ if (kekri->kekid->other == NULL)
+ goto merr;
+ }
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+
+ /* After this point no calls can fail */
+
+ kekri->version = 4;
+
+ kekri->key = key;
+ kekri->keylen = keylen;
+
+ ASN1_STRING_set0(kekri->kekid->keyIdentifier, id, idlen);
+
+ kekri->kekid->date = date;
+
+ if (kekri->kekid->other)
+ {
+ kekri->kekid->other->keyAttrId = otherTypeId;
+ kekri->kekid->other->keyAttr = otherType;
+ }
+
+ X509_ALGOR_set0(kekri->keyEncryptionAlgorithm,
+ OBJ_nid2obj(nid), V_ASN1_UNDEF, NULL);
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY, ERR_R_MALLOC_FAILURE);
+ err:
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ return NULL;
+
+ }
+
+int CMS_RecipientInfo_kekri_get0_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pid,
+ ASN1_GENERALIZEDTIME **pdate,
+ ASN1_OBJECT **potherid,
+ ASN1_TYPE **pothertype)
+ {
+ CMS_KEKIdentifier *rkid;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID, CMS_R_NOT_KEK);
+ return 0;
+ }
+ rkid = ri->d.kekri->kekid;
+ if (palg)
+ *palg = ri->d.kekri->keyEncryptionAlgorithm;
+ if (pid)
+ *pid = rkid->keyIdentifier;
+ if (pdate)
+ *pdate = rkid->date;
+ if (potherid)
+ {
+ if (rkid->other)
+ *potherid = rkid->other->keyAttrId;
+ else
+ *potherid = NULL;
+ }
+ if (pothertype)
+ {
+ if (rkid->other)
+ *pothertype = rkid->other->keyAttr;
+ else
+ *pothertype = NULL;
+ }
+ return 1;
+ }
+
+int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri,
+ unsigned char *key, size_t keylen)
+ {
+ CMS_KEKRecipientInfo *kekri;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_KEY, CMS_R_NOT_KEK);
+ return 0;
+ }
+
+ kekri = ri->d.kekri;
+ kekri->key = key;
+ kekri->keylen = keylen;
+ return 1;
+ }
+
+
+/* Encrypt content key in KEK recipient info */
+
+static int cms_RecipientInfo_kekri_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_KEKRecipientInfo *kekri;
+ AES_KEY actx;
+ unsigned char *wkey = NULL;
+ int wkeylen;
+ int r = 0;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ kekri = ri->d.kekri;
+
+ if (!kekri->key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT, CMS_R_NO_KEY);
+ return 0;
+ }
+
+ if (AES_set_encrypt_key(kekri->key, kekri->keylen << 3, &actx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT,
+ CMS_R_ERROR_SETTING_KEY);
+ goto err;
+ }
+
+ wkey = OPENSSL_malloc(ec->keylen + 8);
+
+ if (!wkey)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ wkeylen = AES_wrap_key(&actx, NULL, wkey, ec->key, ec->keylen);
+
+ if (wkeylen <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT, CMS_R_WRAP_ERROR);
+ goto err;
+ }
+
+ ASN1_STRING_set0(kekri->encryptedKey, wkey, wkeylen);
+
+ r = 1;
+
+ err:
+
+ if (!r && wkey)
+ OPENSSL_free(wkey);
+ OPENSSL_cleanse(&actx, sizeof(actx));
+
+ return r;
+
+ }
+
+/* Decrypt content key in KEK recipient info */
+
+static int cms_RecipientInfo_kekri_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_KEKRecipientInfo *kekri;
+ AES_KEY actx;
+ unsigned char *ukey = NULL;
+ int ukeylen;
+ int r = 0, wrap_nid;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ kekri = ri->d.kekri;
+
+ if (!kekri->key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT, CMS_R_NO_KEY);
+ return 0;
+ }
+
+ wrap_nid = OBJ_obj2nid(kekri->keyEncryptionAlgorithm->algorithm);
+ if (aes_wrap_keylen(wrap_nid) != kekri->keylen)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_INVALID_KEY_LENGTH);
+ return 0;
+ }
+
+	/* If the encrypted key length is invalid, don't bother */
+
+ if (kekri->encryptedKey->length < 16)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_INVALID_ENCRYPTED_KEY_LENGTH);
+ goto err;
+ }
+
+ if (AES_set_decrypt_key(kekri->key, kekri->keylen << 3, &actx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_ERROR_SETTING_KEY);
+ goto err;
+ }
+
+ ukey = OPENSSL_malloc(kekri->encryptedKey->length - 8);
+
+ if (!ukey)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ ukeylen = AES_unwrap_key(&actx, NULL, ukey,
+ kekri->encryptedKey->data,
+ kekri->encryptedKey->length);
+
+ if (ukeylen <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_UNWRAP_ERROR);
+ goto err;
+ }
+
+ ec->key = ukey;
+ ec->keylen = ukeylen;
+
+ r = 1;
+
+ err:
+
+ if (!r && ukey)
+ OPENSSL_free(ukey);
+ OPENSSL_cleanse(&actx, sizeof(actx));
+
+ return r;
+
+ }
+
+int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri)
+ {
+ switch(ri->type)
+ {
+ case CMS_RECIPINFO_TRANS:
+ return cms_RecipientInfo_ktri_decrypt(cms, ri);
+
+ case CMS_RECIPINFO_KEK:
+ return cms_RecipientInfo_kekri_decrypt(cms, ri);
+
+ case CMS_RECIPINFO_PASS:
+ return cms_RecipientInfo_pwri_crypt(cms, ri, 0);
+
+ default:
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT,
+ CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE);
+ return 0;
+ }
+ }
+
+BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_EncryptedContentInfo *ec;
+ STACK_OF(CMS_RecipientInfo) *rinfos;
+ CMS_RecipientInfo *ri;
+ int i, r, ok = 0;
+ BIO *ret;
+
+ /* Get BIO first to set up key */
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+ ret = cms_EncryptedContent_init_bio(ec);
+
+	/* If error or no cipher, end of processing */
+
+ if (!ret || !ec->cipher)
+ return ret;
+
+ /* Now encrypt content key according to each RecipientInfo type */
+
+ rinfos = cms->d.envelopedData->recipientInfos;
+
+ for (i = 0; i < sk_CMS_RecipientInfo_num(rinfos); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(rinfos, i);
+
+ switch (ri->type)
+ {
+ case CMS_RECIPINFO_TRANS:
+ r = cms_RecipientInfo_ktri_encrypt(cms, ri);
+ break;
+
+ case CMS_RECIPINFO_KEK:
+ r = cms_RecipientInfo_kekri_encrypt(cms, ri);
+ break;
+
+ case CMS_RECIPINFO_PASS:
+ r = cms_RecipientInfo_pwri_crypt(cms, ri, 1);
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
+ CMS_R_UNSUPPORTED_RECIPIENT_TYPE);
+ goto err;
+ }
+
+ if (r <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
+ CMS_R_ERROR_SETTING_RECIPIENTINFO);
+ goto err;
+ }
+ }
+
+ ok = 1;
+
+ err:
+ ec->cipher = NULL;
+ if (ec->key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = NULL;
+ ec->keylen = 0;
+ }
+ if (ok)
+ return ret;
+ BIO_free(ret);
+ return NULL;
+
+ }
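
The KEK routines above are normally driven through the public CMS API rather
than called directly. A rough usage sketch (not part of this commit; the
helper names, the 16-byte KEK and its identifier are illustrative
assumptions) might look like this:

#include <openssl/bio.h>
#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/objects.h>

/* Hypothetical sketch: envelope data for a symmetric KEK recipient using
 * AES-128 key wrap. CMS_PARTIAL defers processing so recipients can be
 * added before CMS_final() runs the data through the BIO chain. */
static CMS_ContentInfo *kek_encrypt(BIO *in, unsigned char *kek,
				    unsigned char *id, size_t idlen)
	{
	CMS_ContentInfo *cms;
	cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(),
			  CMS_BINARY | CMS_PARTIAL);
	if (!cms)
		return NULL;
	if (CMS_add0_recipient_key(cms, NID_id_aes128_wrap, kek, 16,
				   id, idlen, NULL, NULL, NULL)
	    && CMS_final(cms, in, NULL, CMS_BINARY))
		return cms;
	CMS_ContentInfo_free(cms);
	return NULL;
	}

/* Decrypt side: supply the same KEK, which ends up in kekri->key via
 * CMS_RecipientInfo_set0_key(), then decrypt the content. */
static int kek_decrypt(CMS_ContentInfo *cms, BIO *out, unsigned char *kek,
		       unsigned char *id, size_t idlen)
	{
	if (!CMS_decrypt_set1_key(cms, kek, 16, id, idlen))
		return 0;
	return CMS_decrypt(cms, NULL, NULL, NULL, out, CMS_BINARY);
	}
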
diff --git a/main/openssl/crypto/cms/cms_err.c b/main/openssl/crypto/cms/cms_err.c
new file mode 100644
index 00000000..8330ead7
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_err.c
@@ -0,0 +1,245 @@
+/* crypto/cms/cms_err.c */
+/* ====================================================================
+ * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/* NOTE: this file was auto generated by the mkerr.pl script: any changes
+ * made to it will be overwritten when the script next updates this file,
+ * only reason strings will be preserved.
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+#define ERR_FUNC(func) ERR_PACK(ERR_LIB_CMS,func,0)
+#define ERR_REASON(reason) ERR_PACK(ERR_LIB_CMS,0,reason)
+
+static ERR_STRING_DATA CMS_str_functs[]=
+ {
+{ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"},
+{ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"},
+{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"},
+{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"},
+{ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"},
+{ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"},
+{ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"},
+{ERR_FUNC(CMS_F_CMS_ADD1_SIGNINGTIME), "CMS_ADD1_SIGNINGTIME"},
+{ERR_FUNC(CMS_F_CMS_COMPRESS), "CMS_compress"},
+{ERR_FUNC(CMS_F_CMS_COMPRESSEDDATA_CREATE), "cms_CompressedData_create"},
+{ERR_FUNC(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO), "cms_CompressedData_init_bio"},
+{ERR_FUNC(CMS_F_CMS_COPY_CONTENT), "CMS_COPY_CONTENT"},
+{ERR_FUNC(CMS_F_CMS_COPY_MESSAGEDIGEST), "CMS_COPY_MESSAGEDIGEST"},
+{ERR_FUNC(CMS_F_CMS_DATA), "CMS_data"},
+{ERR_FUNC(CMS_F_CMS_DATAFINAL), "CMS_dataFinal"},
+{ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"},
+{ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"},
+{ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"},
+{ERR_FUNC(CMS_F_CMS_DIGESTEDDATA_DO_FINAL), "cms_DigestedData_do_final"},
+{ERR_FUNC(CMS_F_CMS_DIGEST_VERIFY), "CMS_digest_verify"},
+{ERR_FUNC(CMS_F_CMS_ENCODE_RECEIPT), "cms_encode_Receipt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPT), "CMS_encrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO), "cms_EncryptedContent_init_bio"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_DECRYPT), "CMS_EncryptedData_decrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT), "CMS_EncryptedData_encrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY), "CMS_EncryptedData_set1_key"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPEDDATA_CREATE), "CMS_EnvelopedData_create"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO), "cms_EnvelopedData_init_bio"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPED_DATA_INIT), "CMS_ENVELOPED_DATA_INIT"},
+{ERR_FUNC(CMS_F_CMS_FINAL), "CMS_final"},
+{ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"},
+{ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"},
+{ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"},
+{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"},
+{ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"},
+{ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"},
+{ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"},
+{ERR_FUNC(CMS_F_CMS_RECEIPTREQUEST_CREATE0), "CMS_ReceiptRequest_create0"},
+{ERR_FUNC(CMS_F_CMS_RECEIPT_VERIFY), "cms_Receipt_verify"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_DECRYPT), "CMS_RecipientInfo_decrypt"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT), "CMS_RECIPIENTINFO_KEKRI_DECRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT), "CMS_RECIPIENTINFO_KEKRI_ENCRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID), "CMS_RecipientInfo_kekri_get0_id"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP), "CMS_RecipientInfo_kekri_id_cmp"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP), "CMS_RecipientInfo_ktri_cert_cmp"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT), "CMS_RECIPIENTINFO_KTRI_DECRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"},
+{ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"},
+{ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"},
+{ERR_FUNC(CMS_F_CMS_SIGN), "CMS_sign"},
+{ERR_FUNC(CMS_F_CMS_SIGNED_DATA_INIT), "CMS_SIGNED_DATA_INIT"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN), "CMS_SIGNERINFO_CONTENT_SIGN"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_SIGN), "CMS_SignerInfo_sign"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY), "CMS_SignerInfo_verify"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY_CERT), "CMS_SIGNERINFO_VERIFY_CERT"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT), "CMS_SignerInfo_verify_content"},
+{ERR_FUNC(CMS_F_CMS_SIGN_RECEIPT), "CMS_sign_receipt"},
+{ERR_FUNC(CMS_F_CMS_STREAM), "CMS_stream"},
+{ERR_FUNC(CMS_F_CMS_UNCOMPRESS), "CMS_uncompress"},
+{ERR_FUNC(CMS_F_CMS_VERIFY), "CMS_verify"},
+{0,NULL}
+ };
+
+static ERR_STRING_DATA CMS_str_reasons[]=
+ {
+{ERR_REASON(CMS_R_ADD_SIGNER_ERROR) ,"add signer error"},
+{ERR_REASON(CMS_R_CERTIFICATE_ALREADY_PRESENT),"certificate already present"},
+{ERR_REASON(CMS_R_CERTIFICATE_HAS_NO_KEYID),"certificate has no keyid"},
+{ERR_REASON(CMS_R_CERTIFICATE_VERIFY_ERROR),"certificate verify error"},
+{ERR_REASON(CMS_R_CIPHER_INITIALISATION_ERROR),"cipher initialisation error"},
+{ERR_REASON(CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR),"cipher parameter initialisation error"},
+{ERR_REASON(CMS_R_CMS_DATAFINAL_ERROR) ,"cms datafinal error"},
+{ERR_REASON(CMS_R_CMS_LIB) ,"cms lib"},
+{ERR_REASON(CMS_R_CONTENTIDENTIFIER_MISMATCH),"contentidentifier mismatch"},
+{ERR_REASON(CMS_R_CONTENT_NOT_FOUND) ,"content not found"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_MISMATCH) ,"content type mismatch"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA),"content type not compressed data"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA),"content type not enveloped data"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA),"content type not signed data"},
+{ERR_REASON(CMS_R_CONTENT_VERIFY_ERROR) ,"content verify error"},
+{ERR_REASON(CMS_R_CTRL_ERROR) ,"ctrl error"},
+{ERR_REASON(CMS_R_CTRL_FAILURE) ,"ctrl failure"},
+{ERR_REASON(CMS_R_DECRYPT_ERROR) ,"decrypt error"},
+{ERR_REASON(CMS_R_DIGEST_ERROR) ,"digest error"},
+{ERR_REASON(CMS_R_ERROR_GETTING_PUBLIC_KEY),"error getting public key"},
+{ERR_REASON(CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE),"error reading messagedigest attribute"},
+{ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"},
+{ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"},
+{ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"},
+{ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"},
+{ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"},
+{ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"},
+{ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"},
+{ERR_REASON(CMS_R_MESSAGEDIGEST_WRONG_LENGTH),"messagedigest wrong length"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_ERROR) ,"msgsigdigest error"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE),"msgsigdigest verification failure"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_WRONG_LENGTH),"msgsigdigest wrong length"},
+{ERR_REASON(CMS_R_NEED_ONE_SIGNER) ,"need one signer"},
+{ERR_REASON(CMS_R_NOT_A_SIGNED_RECEIPT) ,"not a signed receipt"},
+{ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"},
+{ERR_REASON(CMS_R_NOT_KEK) ,"not kek"},
+{ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"},
+{ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"},
+{ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"},
+{ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"},
+{ERR_REASON(CMS_R_NO_CONTENT) ,"no content"},
+{ERR_REASON(CMS_R_NO_CONTENT_TYPE) ,"no content type"},
+{ERR_REASON(CMS_R_NO_DEFAULT_DIGEST) ,"no default digest"},
+{ERR_REASON(CMS_R_NO_DIGEST_SET) ,"no digest set"},
+{ERR_REASON(CMS_R_NO_KEY) ,"no key"},
+{ERR_REASON(CMS_R_NO_KEY_OR_CERT) ,"no key or cert"},
+{ERR_REASON(CMS_R_NO_MATCHING_DIGEST) ,"no matching digest"},
+{ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"},
+{ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"},
+{ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"},
+{ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"},
+{ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"},
+{ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"},
+{ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"},
+{ERR_REASON(CMS_R_NO_SIGNERS) ,"no signers"},
+{ERR_REASON(CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE),"private key does not match certificate"},
+{ERR_REASON(CMS_R_RECEIPT_DECODE_ERROR) ,"receipt decode error"},
+{ERR_REASON(CMS_R_RECIPIENT_ERROR) ,"recipient error"},
+{ERR_REASON(CMS_R_SIGNER_CERTIFICATE_NOT_FOUND),"signer certificate not found"},
+{ERR_REASON(CMS_R_SIGNFINAL_ERROR) ,"signfinal error"},
+{ERR_REASON(CMS_R_SMIME_TEXT_ERROR) ,"smime text error"},
+{ERR_REASON(CMS_R_STORE_INIT_ERROR) ,"store init error"},
+{ERR_REASON(CMS_R_TYPE_NOT_COMPRESSED_DATA),"type not compressed data"},
+{ERR_REASON(CMS_R_TYPE_NOT_DATA) ,"type not data"},
+{ERR_REASON(CMS_R_TYPE_NOT_DIGESTED_DATA),"type not digested data"},
+{ERR_REASON(CMS_R_TYPE_NOT_ENCRYPTED_DATA),"type not encrypted data"},
+{ERR_REASON(CMS_R_TYPE_NOT_ENVELOPED_DATA),"type not enveloped data"},
+{ERR_REASON(CMS_R_UNABLE_TO_FINALIZE_CONTEXT),"unable to finalize context"},
+{ERR_REASON(CMS_R_UNKNOWN_CIPHER) ,"unknown cipher"},
+{ERR_REASON(CMS_R_UNKNOWN_DIGEST_ALGORIHM),"unknown digest algorihm"},
+{ERR_REASON(CMS_R_UNKNOWN_ID) ,"unknown id"},
+{ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"},
+{ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"},
+{ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"},
+{ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"},
+{ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"},
+{0,NULL}
+ };
+
+#endif
+
+void ERR_load_CMS_strings(void)
+ {
+#ifndef OPENSSL_NO_ERR
+
+ if (ERR_func_error_string(CMS_str_functs[0].error) == NULL)
+ {
+ ERR_load_strings(0,CMS_str_functs);
+ ERR_load_strings(0,CMS_str_reasons);
+ }
+#endif
+ }
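
Once registered, these tables are consulted by the generic ERR machinery. A
minimal sketch of how an application might surface them (hypothetical
example, not from this commit):

#include <stdio.h>
#include <openssl/err.h>
#include <openssl/cms.h>

int main(void)
	{
	/* Register the CMS tables above; ERR_load_crypto_strings() would
	 * register every crypto library's table, including this one. */
	ERR_load_CMS_strings();
	/* Raise a sample error so there is something to print */
	CMSerr(CMS_F_CMS_DECRYPT, CMS_R_NO_KEY);
	/* Prints the function and reason strings from the tables */
	ERR_print_errors_fp(stderr);
	return 0;
	}
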
diff --git a/main/openssl/crypto/cms/cms_ess.c b/main/openssl/crypto/cms/cms_ess.c
new file mode 100644
index 00000000..90c0b82f
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_ess.c
@@ -0,0 +1,420 @@
+/* crypto/cms/cms_ess.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/rand.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_ReceiptRequest)
+DECLARE_ASN1_ITEM(CMS_Receipt)
+
+IMPLEMENT_ASN1_FUNCTIONS(CMS_ReceiptRequest)
+
+/* ESS services: for now just Signed Receipt related */
+
+int CMS_get1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest **prr)
+ {
+ ASN1_STRING *str;
+ CMS_ReceiptRequest *rr = NULL;
+ if (prr)
+ *prr = NULL;
+ str = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_id_smime_aa_receiptRequest),
+ -3, V_ASN1_SEQUENCE);
+ if (!str)
+ return 0;
+
+ rr = ASN1_item_unpack(str, ASN1_ITEM_rptr(CMS_ReceiptRequest));
+ if (!rr)
+ return -1;
+ if (prr)
+ *prr = rr;
+ else
+ CMS_ReceiptRequest_free(rr);
+ return 1;
+ }
+
+CMS_ReceiptRequest *CMS_ReceiptRequest_create0(unsigned char *id, int idlen,
+ int allorfirst,
+ STACK_OF(GENERAL_NAMES) *receiptList,
+ STACK_OF(GENERAL_NAMES) *receiptsTo)
+ {
+ CMS_ReceiptRequest *rr = NULL;
+
+ rr = CMS_ReceiptRequest_new();
+ if (!rr)
+ goto merr;
+ if (id)
+ ASN1_STRING_set0(rr->signedContentIdentifier, id, idlen);
+ else
+ {
+ if (!ASN1_STRING_set(rr->signedContentIdentifier, NULL, 32))
+ goto merr;
+ if (RAND_pseudo_bytes(rr->signedContentIdentifier->data, 32)
+ <= 0)
+ goto err;
+ }
+
+ sk_GENERAL_NAMES_pop_free(rr->receiptsTo, GENERAL_NAMES_free);
+ rr->receiptsTo = receiptsTo;
+
+ if (receiptList)
+ {
+ rr->receiptsFrom->type = 1;
+ rr->receiptsFrom->d.receiptList = receiptList;
+ }
+ else
+ {
+ rr->receiptsFrom->type = 0;
+ rr->receiptsFrom->d.allOrFirstTier = allorfirst;
+ }
+
+ return rr;
+
+ merr:
+ CMSerr(CMS_F_CMS_RECEIPTREQUEST_CREATE0, ERR_R_MALLOC_FAILURE);
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+
+ return NULL;
+
+ }
+
+int CMS_add1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest *rr)
+ {
+ unsigned char *rrder = NULL;
+ int rrderlen, r = 0;
+
+ rrderlen = i2d_CMS_ReceiptRequest(rr, &rrder);
+ if (rrderlen < 0)
+ goto merr;
+
+ if (!CMS_signed_add1_attr_by_NID(si, NID_id_smime_aa_receiptRequest,
+ V_ASN1_SEQUENCE, rrder, rrderlen))
+ goto merr;
+
+ r = 1;
+
+ merr:
+ if (!r)
+ CMSerr(CMS_F_CMS_ADD1_RECEIPTREQUEST, ERR_R_MALLOC_FAILURE);
+
+ if (rrder)
+ OPENSSL_free(rrder);
+
+ return r;
+
+ }
+
+void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr,
+ ASN1_STRING **pcid,
+ int *pallorfirst,
+ STACK_OF(GENERAL_NAMES) **plist,
+ STACK_OF(GENERAL_NAMES) **prto)
+ {
+ if (pcid)
+ *pcid = rr->signedContentIdentifier;
+ if (rr->receiptsFrom->type == 0)
+ {
+ if (pallorfirst)
+ *pallorfirst = (int)rr->receiptsFrom->d.allOrFirstTier;
+ if (plist)
+ *plist = NULL;
+ }
+ else
+ {
+ if (pallorfirst)
+ *pallorfirst = -1;
+ if (plist)
+ *plist = rr->receiptsFrom->d.receiptList;
+ }
+ if (prto)
+ *prto = rr->receiptsTo;
+ }
+
+/* Digest a SignerInfo structure for msgSigDigest attribute processing */
+
+static int cms_msgSigDigest(CMS_SignerInfo *si,
+ unsigned char *dig, unsigned int *diglen)
+ {
+ const EVP_MD *md;
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return 0;
+ if (!ASN1_item_digest(ASN1_ITEM_rptr(CMS_Attributes_Verify), md,
+ si->signedAttrs, dig, diglen))
+ return 0;
+ return 1;
+ }
+
+/* Add a msgSigDigest attribute to a SignerInfo */
+
+int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src)
+ {
+ unsigned char dig[EVP_MAX_MD_SIZE];
+ unsigned int diglen;
+ if (!cms_msgSigDigest(src, dig, &diglen))
+ {
+ CMSerr(CMS_F_CMS_MSGSIGDIGEST_ADD1, CMS_R_MSGSIGDIGEST_ERROR);
+ return 0;
+ }
+ if (!CMS_signed_add1_attr_by_NID(dest, NID_id_smime_aa_msgSigDigest,
+ V_ASN1_OCTET_STRING, dig, diglen))
+ {
+ CMSerr(CMS_F_CMS_MSGSIGDIGEST_ADD1, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ return 1;
+ }
+
+/* Verify signed receipt after it has already passed normal CMS verify */
+
+int cms_Receipt_verify(CMS_ContentInfo *cms, CMS_ContentInfo *req_cms)
+ {
+ int r = 0, i;
+ CMS_ReceiptRequest *rr = NULL;
+ CMS_Receipt *rct = NULL;
+ STACK_OF(CMS_SignerInfo) *sis, *osis;
+ CMS_SignerInfo *si, *osi = NULL;
+ ASN1_OCTET_STRING *msig, **pcont;
+ ASN1_OBJECT *octype;
+ unsigned char dig[EVP_MAX_MD_SIZE];
+ unsigned int diglen;
+
+ /* Get SignerInfos, also checks SignedData content type */
+ osis = CMS_get0_SignerInfos(req_cms);
+ sis = CMS_get0_SignerInfos(cms);
+ if (!osis || !sis)
+ goto err;
+
+ if (sk_CMS_SignerInfo_num(sis) != 1)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NEED_ONE_SIGNER);
+ goto err;
+ }
+
+ /* Check receipt content type */
+ if (OBJ_obj2nid(CMS_get0_eContentType(cms)) != NID_id_smime_ct_receipt)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NOT_A_SIGNED_RECEIPT);
+ goto err;
+ }
+
+ /* Extract and decode receipt content */
+ pcont = CMS_get0_content(cms);
+ if (!pcont || !*pcont)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_CONTENT);
+ goto err;
+ }
+
+ rct = ASN1_item_unpack(*pcont, ASN1_ITEM_rptr(CMS_Receipt));
+
+ if (!rct)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_RECEIPT_DECODE_ERROR);
+ goto err;
+ }
+
+ /* Locate original request */
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(osis); i++)
+ {
+ osi = sk_CMS_SignerInfo_value(osis, i);
+ if (!ASN1_STRING_cmp(osi->signature,
+ rct->originatorSignatureValue))
+ break;
+ }
+
+ if (i == sk_CMS_SignerInfo_num(osis))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_MATCHING_SIGNATURE);
+ goto err;
+ }
+
+ si = sk_CMS_SignerInfo_value(sis, 0);
+
+ /* Get msgSigDigest value and compare */
+
+ msig = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_id_smime_aa_msgSigDigest),
+ -3, V_ASN1_OCTET_STRING);
+
+ if (!msig)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_MSGSIGDIGEST);
+ goto err;
+ }
+
+ if (!cms_msgSigDigest(osi, dig, &diglen))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_MSGSIGDIGEST_ERROR);
+ goto err;
+ }
+
+ if (diglen != (unsigned int)msig->length)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_MSGSIGDIGEST_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(dig, msig->data, diglen))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE);
+ goto err;
+ }
+
+ /* Compare content types */
+
+ octype = CMS_signed_get0_data_by_OBJ(osi,
+ OBJ_nid2obj(NID_pkcs9_contentType),
+ -3, V_ASN1_OBJECT);
+ if (!octype)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_CONTENT_TYPE);
+ goto err;
+ }
+
+ /* Compare details in receipt request */
+
+ if (OBJ_cmp(octype, rct->contentType))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_CONTENT_TYPE_MISMATCH);
+ goto err;
+ }
+
+ /* Get original receipt request details */
+
+ if (CMS_get1_ReceiptRequest(osi, &rr) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_RECEIPT_REQUEST);
+ goto err;
+ }
+
+ if (ASN1_STRING_cmp(rr->signedContentIdentifier,
+ rct->signedContentIdentifier))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_CONTENTIDENTIFIER_MISMATCH);
+ goto err;
+ }
+
+ r = 1;
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+ if (rct)
+ M_ASN1_free_of(rct, CMS_Receipt);
+
+ return r;
+
+ }
+
+/* Encode a Receipt into an OCTET STRING ready for inclusion in the
+ * content of a SignedData ContentInfo.
+ */
+
+ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si)
+ {
+ CMS_Receipt rct;
+ CMS_ReceiptRequest *rr = NULL;
+ ASN1_OBJECT *ctype;
+ ASN1_OCTET_STRING *os = NULL;
+
+	/* Get original receipt request details */
+
+ if (CMS_get1_ReceiptRequest(si, &rr) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCODE_RECEIPT, CMS_R_NO_RECEIPT_REQUEST);
+ goto err;
+ }
+
+ /* Get original content type */
+
+ ctype = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_pkcs9_contentType),
+ -3, V_ASN1_OBJECT);
+ if (!ctype)
+ {
+ CMSerr(CMS_F_CMS_ENCODE_RECEIPT, CMS_R_NO_CONTENT_TYPE);
+ goto err;
+ }
+
+ rct.version = 1;
+ rct.contentType = ctype;
+ rct.signedContentIdentifier = rr->signedContentIdentifier;
+ rct.originatorSignatureValue = si->signature;
+
+ os = ASN1_item_pack(&rct, ASN1_ITEM_rptr(CMS_Receipt), NULL);
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+
+ return os;
+
+ }
+
+
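A plausible caller-side sketch for the receipt-request helpers above
(hypothetical function, not from this commit). Note that
CMS_ReceiptRequest_create0() takes ownership of the receiptsTo stack, and a
NULL id asks the library to generate a random 32-byte
signedContentIdentifier, as seen in the function body earlier:

#include <openssl/cms.h>

static int request_receipt(CMS_SignerInfo *si,
			   STACK_OF(GENERAL_NAMES) *receiptsTo)
	{
	CMS_ReceiptRequest *rr;
	int r;
	/* allorfirst == 0 requests receipts from all recipients */
	rr = CMS_ReceiptRequest_create0(NULL, -1, 0, NULL, receiptsTo);
	if (!rr)
		return 0;
	/* add1: the request is DER-encoded into a signed attribute,
	 * so rr can be freed afterwards */
	r = CMS_add1_ReceiptRequest(si, rr);
	CMS_ReceiptRequest_free(rr);
	return r;
	}
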
diff --git a/main/openssl/crypto/cms/cms_io.c b/main/openssl/crypto/cms/cms_io.c
new file mode 100644
index 00000000..1cb0264c
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_io.c
@@ -0,0 +1,133 @@
+/* crypto/cms/cms_io.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/err.h>
+#include <openssl/pem.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+int CMS_stream(unsigned char ***boundary, CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+ if (!*pos)
+ *pos = ASN1_OCTET_STRING_new();
+ if (*pos)
+ {
+ (*pos)->flags |= ASN1_STRING_FLAG_NDEF;
+ (*pos)->flags &= ~ASN1_STRING_FLAG_CONT;
+ *boundary = &(*pos)->data;
+ return 1;
+ }
+ CMSerr(CMS_F_CMS_STREAM, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+CMS_ContentInfo *d2i_CMS_bio(BIO *bp, CMS_ContentInfo **cms)
+ {
+ return ASN1_item_d2i_bio(ASN1_ITEM_rptr(CMS_ContentInfo), bp, cms);
+ }
+
+int i2d_CMS_bio(BIO *bp, CMS_ContentInfo *cms)
+ {
+ return ASN1_item_i2d_bio(ASN1_ITEM_rptr(CMS_ContentInfo), bp, cms);
+ }
+
+IMPLEMENT_PEM_rw_const(CMS, CMS_ContentInfo, PEM_STRING_CMS, CMS_ContentInfo)
+
+BIO *BIO_new_CMS(BIO *out, CMS_ContentInfo *cms)
+ {
+ return BIO_new_NDEF(out, (ASN1_VALUE *)cms,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+/* CMS wrappers round generalised stream and MIME routines */
+
+int i2d_CMS_bio_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags)
+ {
+ return i2d_ASN1_bio_stream(out, (ASN1_VALUE *)cms, in, flags,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+int PEM_write_bio_CMS_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags)
+ {
+ return PEM_write_bio_ASN1_stream(out, (ASN1_VALUE *) cms, in, flags,
+ "CMS",
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+int SMIME_write_CMS(BIO *bio, CMS_ContentInfo *cms, BIO *data, int flags)
+ {
+ STACK_OF(X509_ALGOR) *mdalgs;
+ int ctype_nid = OBJ_obj2nid(cms->contentType);
+ int econt_nid = OBJ_obj2nid(CMS_get0_eContentType(cms));
+ if (ctype_nid == NID_pkcs7_signed)
+ mdalgs = cms->d.signedData->digestAlgorithms;
+ else
+ mdalgs = NULL;
+
+ return SMIME_write_ASN1(bio, (ASN1_VALUE *)cms, data, flags,
+ ctype_nid, econt_nid, mdalgs,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+CMS_ContentInfo *SMIME_read_CMS(BIO *bio, BIO **bcont)
+ {
+ return (CMS_ContentInfo *)SMIME_read_ASN1(bio, bcont,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
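The wrappers above make a full S/MIME round trip short. A minimal sketch
(hypothetical helper, not from this commit), writing a CMS structure out as
S/MIME and parsing it back from a memory BIO:

#include <openssl/bio.h>
#include <openssl/cms.h>

static CMS_ContentInfo *smime_roundtrip(CMS_ContentInfo *cms, BIO *data)
	{
	BIO *tmp = BIO_new(BIO_s_mem());
	BIO *bcont = NULL;
	CMS_ContentInfo *parsed = NULL;
	if (!tmp)
		return NULL;
	if (SMIME_write_CMS(tmp, cms, data, CMS_STREAM))
		parsed = SMIME_read_CMS(tmp, &bcont);
	BIO_free(bcont);	/* non-NULL only for detached content */
	BIO_free(tmp);
	return parsed;
	}
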
diff --git a/main/openssl/crypto/cms/cms_lcl.h b/main/openssl/crypto/cms/cms_lcl.h
new file mode 100644
index 00000000..a9f97301
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_lcl.h
@@ -0,0 +1,473 @@
+/* crypto/cms/cms_lcl.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef HEADER_CMS_LCL_H
+#define HEADER_CMS_LCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/x509.h>
+
+/* Cryptographic message syntax (CMS) structures: taken
+ * from RFC3852
+ */
+
+/* Forward references */
+
+typedef struct CMS_IssuerAndSerialNumber_st CMS_IssuerAndSerialNumber;
+typedef struct CMS_EncapsulatedContentInfo_st CMS_EncapsulatedContentInfo;
+typedef struct CMS_SignerIdentifier_st CMS_SignerIdentifier;
+typedef struct CMS_SignedData_st CMS_SignedData;
+typedef struct CMS_OtherRevocationInfoFormat_st CMS_OtherRevocationInfoFormat;
+typedef struct CMS_OriginatorInfo_st CMS_OriginatorInfo;
+typedef struct CMS_EncryptedContentInfo_st CMS_EncryptedContentInfo;
+typedef struct CMS_EnvelopedData_st CMS_EnvelopedData;
+typedef struct CMS_DigestedData_st CMS_DigestedData;
+typedef struct CMS_EncryptedData_st CMS_EncryptedData;
+typedef struct CMS_AuthenticatedData_st CMS_AuthenticatedData;
+typedef struct CMS_CompressedData_st CMS_CompressedData;
+typedef struct CMS_OtherCertificateFormat_st CMS_OtherCertificateFormat;
+typedef struct CMS_KeyTransRecipientInfo_st CMS_KeyTransRecipientInfo;
+typedef struct CMS_OriginatorPublicKey_st CMS_OriginatorPublicKey;
+typedef struct CMS_OriginatorIdentifierOrKey_st CMS_OriginatorIdentifierOrKey;
+typedef struct CMS_KeyAgreeRecipientInfo_st CMS_KeyAgreeRecipientInfo;
+typedef struct CMS_OtherKeyAttribute_st CMS_OtherKeyAttribute;
+typedef struct CMS_RecipientKeyIdentifier_st CMS_RecipientKeyIdentifier;
+typedef struct CMS_KeyAgreeRecipientIdentifier_st CMS_KeyAgreeRecipientIdentifier;
+typedef struct CMS_RecipientEncryptedKey_st CMS_RecipientEncryptedKey;
+typedef struct CMS_KEKIdentifier_st CMS_KEKIdentifier;
+typedef struct CMS_KEKRecipientInfo_st CMS_KEKRecipientInfo;
+typedef struct CMS_PasswordRecipientInfo_st CMS_PasswordRecipientInfo;
+typedef struct CMS_OtherRecipientInfo_st CMS_OtherRecipientInfo;
+typedef struct CMS_ReceiptsFrom_st CMS_ReceiptsFrom;
+
+struct CMS_ContentInfo_st
+ {
+ ASN1_OBJECT *contentType;
+ union {
+ ASN1_OCTET_STRING *data;
+ CMS_SignedData *signedData;
+ CMS_EnvelopedData *envelopedData;
+ CMS_DigestedData *digestedData;
+ CMS_EncryptedData *encryptedData;
+ CMS_AuthenticatedData *authenticatedData;
+ CMS_CompressedData *compressedData;
+ ASN1_TYPE *other;
+ /* Other types ... */
+ void *otherData;
+ } d;
+ };
+
+struct CMS_SignedData_st
+ {
+ long version;
+ STACK_OF(X509_ALGOR) *digestAlgorithms;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ STACK_OF(CMS_CertificateChoices) *certificates;
+ STACK_OF(CMS_RevocationInfoChoice) *crls;
+ STACK_OF(CMS_SignerInfo) *signerInfos;
+ };
+
+struct CMS_EncapsulatedContentInfo_st
+ {
+ ASN1_OBJECT *eContentType;
+ ASN1_OCTET_STRING *eContent;
+	/* Set to 1 if the structure is incomplete, i.e. only partly set up */
+ int partial;
+ };
+
+struct CMS_SignerInfo_st
+ {
+ long version;
+ CMS_SignerIdentifier *sid;
+ X509_ALGOR *digestAlgorithm;
+ STACK_OF(X509_ATTRIBUTE) *signedAttrs;
+ X509_ALGOR *signatureAlgorithm;
+ ASN1_OCTET_STRING *signature;
+ STACK_OF(X509_ATTRIBUTE) *unsignedAttrs;
+ /* Signing certificate and key */
+ X509 *signer;
+ EVP_PKEY *pkey;
+ };
+
+struct CMS_SignerIdentifier_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ } d;
+ };
+
+struct CMS_EnvelopedData_st
+ {
+ long version;
+ CMS_OriginatorInfo *originatorInfo;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ CMS_EncryptedContentInfo *encryptedContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *unprotectedAttrs;
+ };
+
+struct CMS_OriginatorInfo_st
+ {
+ STACK_OF(CMS_CertificateChoices) *certificates;
+ STACK_OF(CMS_RevocationInfoChoice) *crls;
+ };
+
+struct CMS_EncryptedContentInfo_st
+ {
+ ASN1_OBJECT *contentType;
+ X509_ALGOR *contentEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedContent;
+ /* Content encryption algorithm and key */
+ const EVP_CIPHER *cipher;
+ unsigned char *key;
+ size_t keylen;
+ /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */
+ int debug;
+ };
+
+struct CMS_RecipientInfo_st
+ {
+ int type;
+ union {
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_KeyAgreeRecipientInfo *kari;
+ CMS_KEKRecipientInfo *kekri;
+ CMS_PasswordRecipientInfo *pwri;
+ CMS_OtherRecipientInfo *ori;
+ } d;
+ };
+
+typedef CMS_SignerIdentifier CMS_RecipientIdentifier;
+
+struct CMS_KeyTransRecipientInfo_st
+ {
+ long version;
+ CMS_RecipientIdentifier *rid;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Recipient Key and cert */
+ X509 *recip;
+ EVP_PKEY *pkey;
+ };
+
+struct CMS_KeyAgreeRecipientInfo_st
+ {
+ long version;
+ CMS_OriginatorIdentifierOrKey *originator;
+ ASN1_OCTET_STRING *ukm;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ STACK_OF(CMS_RecipientEncryptedKey) *recipientEncryptedKeys;
+ };
+
+struct CMS_OriginatorIdentifierOrKey_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ CMS_OriginatorPublicKey *originatorKey;
+ } d;
+ };
+
+struct CMS_OriginatorPublicKey_st
+ {
+ X509_ALGOR *algorithm;
+ ASN1_BIT_STRING *publicKey;
+ };
+
+struct CMS_RecipientEncryptedKey_st
+ {
+ CMS_KeyAgreeRecipientIdentifier *rid;
+ ASN1_OCTET_STRING *encryptedKey;
+ };
+
+struct CMS_KeyAgreeRecipientIdentifier_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ CMS_RecipientKeyIdentifier *rKeyId;
+ } d;
+ };
+
+struct CMS_RecipientKeyIdentifier_st
+ {
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ ASN1_GENERALIZEDTIME *date;
+ CMS_OtherKeyAttribute *other;
+ };
+
+struct CMS_KEKRecipientInfo_st
+ {
+ long version;
+ CMS_KEKIdentifier *kekid;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Extra info: symmetric key to use */
+ unsigned char *key;
+ size_t keylen;
+ };
+
+struct CMS_KEKIdentifier_st
+ {
+ ASN1_OCTET_STRING *keyIdentifier;
+ ASN1_GENERALIZEDTIME *date;
+ CMS_OtherKeyAttribute *other;
+ };
+
+struct CMS_PasswordRecipientInfo_st
+ {
+ long version;
+ X509_ALGOR *keyDerivationAlgorithm;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Extra info: password to use */
+ unsigned char *pass;
+ size_t passlen;
+ };
+
+struct CMS_OtherRecipientInfo_st
+ {
+ ASN1_OBJECT *oriType;
+ ASN1_TYPE *oriValue;
+ };
+
+struct CMS_DigestedData_st
+ {
+ long version;
+ X509_ALGOR *digestAlgorithm;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ ASN1_OCTET_STRING *digest;
+ };
+
+struct CMS_EncryptedData_st
+ {
+ long version;
+ CMS_EncryptedContentInfo *encryptedContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *unprotectedAttrs;
+ };
+
+struct CMS_AuthenticatedData_st
+ {
+ long version;
+ CMS_OriginatorInfo *originatorInfo;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ X509_ALGOR *macAlgorithm;
+ X509_ALGOR *digestAlgorithm;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *authAttrs;
+ ASN1_OCTET_STRING *mac;
+ STACK_OF(X509_ATTRIBUTE) *unauthAttrs;
+ };
+
+struct CMS_CompressedData_st
+ {
+ long version;
+ X509_ALGOR *compressionAlgorithm;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ };
+
+struct CMS_RevocationInfoChoice_st
+ {
+ int type;
+ union {
+ X509_CRL *crl;
+ CMS_OtherRevocationInfoFormat *other;
+ } d;
+ };
+
+#define CMS_REVCHOICE_CRL 0
+#define CMS_REVCHOICE_OTHER 1
+
+struct CMS_OtherRevocationInfoFormat_st
+ {
+ ASN1_OBJECT *otherRevInfoFormat;
+ ASN1_TYPE *otherRevInfo;
+ };
+
+struct CMS_CertificateChoices
+ {
+ int type;
+ union {
+ X509 *certificate;
+ ASN1_STRING *extendedCertificate; /* Obsolete */
+ ASN1_STRING *v1AttrCert; /* Left encoded for now */
+ ASN1_STRING *v2AttrCert; /* Left encoded for now */
+ CMS_OtherCertificateFormat *other;
+ } d;
+ };
+
+#define CMS_CERTCHOICE_CERT 0
+#define CMS_CERTCHOICE_EXCERT 1
+#define CMS_CERTCHOICE_V1ACERT 2
+#define CMS_CERTCHOICE_V2ACERT 3
+#define CMS_CERTCHOICE_OTHER 4
+
+struct CMS_OtherCertificateFormat_st
+ {
+ ASN1_OBJECT *otherCertFormat;
+ ASN1_TYPE *otherCert;
+ };
+
+/* This is also defined in pkcs7.h but we duplicate it
+ * to allow the CMS code to be independent of PKCS#7
+ */
+
+struct CMS_IssuerAndSerialNumber_st
+ {
+ X509_NAME *issuer;
+ ASN1_INTEGER *serialNumber;
+ };
+
+struct CMS_OtherKeyAttribute_st
+ {
+ ASN1_OBJECT *keyAttrId;
+ ASN1_TYPE *keyAttr;
+ };
+
+/* ESS structures */
+
+#ifdef HEADER_X509V3_H
+
+struct CMS_ReceiptRequest_st
+ {
+ ASN1_OCTET_STRING *signedContentIdentifier;
+ CMS_ReceiptsFrom *receiptsFrom;
+ STACK_OF(GENERAL_NAMES) *receiptsTo;
+ };
+
+
+struct CMS_ReceiptsFrom_st
+ {
+ int type;
+ union
+ {
+ long allOrFirstTier;
+ STACK_OF(GENERAL_NAMES) *receiptList;
+ } d;
+ };
+#endif
+
+struct CMS_Receipt_st
+ {
+ long version;
+ ASN1_OBJECT *contentType;
+ ASN1_OCTET_STRING *signedContentIdentifier;
+ ASN1_OCTET_STRING *originatorSignatureValue;
+ };
+
+DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo)
+DECLARE_ASN1_ITEM(CMS_SignerInfo)
+DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber)
+DECLARE_ASN1_ITEM(CMS_Attributes_Sign)
+DECLARE_ASN1_ITEM(CMS_Attributes_Verify)
+DECLARE_ASN1_ITEM(CMS_RecipientInfo)
+DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo)
+DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber)
+
+#define CMS_SIGNERINFO_ISSUER_SERIAL 0
+#define CMS_SIGNERINFO_KEYIDENTIFIER 1
+
+#define CMS_RECIPINFO_ISSUER_SERIAL 0
+#define CMS_RECIPINFO_KEYIDENTIFIER 1
+
+BIO *cms_content_bio(CMS_ContentInfo *cms);
+
+CMS_ContentInfo *cms_Data_create(void);
+
+CMS_ContentInfo *cms_DigestedData_create(const EVP_MD *md);
+BIO *cms_DigestedData_init_bio(CMS_ContentInfo *cms);
+int cms_DigestedData_do_final(CMS_ContentInfo *cms, BIO *chain, int verify);
+
+BIO *cms_SignedData_init_bio(CMS_ContentInfo *cms);
+int cms_SignedData_final(CMS_ContentInfo *cms, BIO *chain);
+int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type);
+int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+int cms_SignerIdentifier_cert_cmp(CMS_SignerIdentifier *sid, X509 *cert);
+
+CMS_ContentInfo *cms_CompressedData_create(int comp_nid);
+BIO *cms_CompressedData_init_bio(CMS_ContentInfo *cms);
+
+void cms_DigestAlgorithm_set(X509_ALGOR *alg, const EVP_MD *md);
+BIO *cms_DigestAlgorithm_init_bio(X509_ALGOR *digestAlgorithm);
+int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
+ X509_ALGOR *mdalg);
+
+BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec);
+BIO *cms_EncryptedData_init_bio(CMS_ContentInfo *cms);
+int cms_EncryptedContent_init(CMS_EncryptedContentInfo *ec,
+ const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen);
+
+int cms_Receipt_verify(CMS_ContentInfo *cms, CMS_ContentInfo *req_cms);
+int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src);
+ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si);
+
+BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms);
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms);
+
+/* PWRI routines */
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+ int en_de);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
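
Every CHOICE type in this header follows the same shape: an explicit type
discriminator beside a union, so code dispatches on the discriminator before
touching a branch, as CMS_RecipientInfo_decrypt() does. A sketch of the
idiom (hypothetical accessor; it only compiles inside the library, since
these structs are opaque to applications):

#include <openssl/cms.h>
#include "cms_lcl.h"

static CMS_KEKRecipientInfo *ri_get0_kekri(CMS_RecipientInfo *ri)
	{
	if (ri->type != CMS_RECIPINFO_KEK)
		return NULL;	/* wrong branch: d.kekri would be garbage */
	return ri->d.kekri;
	}
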
diff --git a/main/openssl/crypto/cms/cms_lib.c b/main/openssl/crypto/cms/cms_lib.c
new file mode 100644
index 00000000..b62d1bfa
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_lib.c
@@ -0,0 +1,624 @@
+/* crypto/cms/cms_lib.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/err.h>
+#include <openssl/pem.h>
+#include <openssl/bio.h>
+#include <openssl/asn1.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+IMPLEMENT_ASN1_FUNCTIONS(CMS_ContentInfo)
+IMPLEMENT_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
+
+DECLARE_ASN1_ITEM(CMS_CertificateChoices)
+DECLARE_ASN1_ITEM(CMS_RevocationInfoChoice)
+DECLARE_STACK_OF(CMS_CertificateChoices)
+DECLARE_STACK_OF(CMS_RevocationInfoChoice)
+
+const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms)
+ {
+ return cms->contentType;
+ }
+
+CMS_ContentInfo *cms_Data_create(void)
+ {
+ CMS_ContentInfo *cms;
+ cms = CMS_ContentInfo_new();
+ if (cms)
+ {
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_data);
+ /* Never detached */
+ CMS_set_detached(cms, 0);
+ }
+ return cms;
+ }
+
+BIO *cms_content_bio(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos)
+ return NULL;
+	/* If content is detached, data goes nowhere: create a null BIO */
+ if (!*pos)
+ return BIO_new(BIO_s_null());
+	/* If content is not detached and was created (not read in),
+	 * return a memory BIO */
+ if (!*pos || ((*pos)->flags == ASN1_STRING_FLAG_CONT))
+ return BIO_new(BIO_s_mem());
+ /* Else content was read in: return read only BIO for it */
+ return BIO_new_mem_buf((*pos)->data, (*pos)->length);
+ }
+
+BIO *CMS_dataInit(CMS_ContentInfo *cms, BIO *icont)
+ {
+ BIO *cmsbio, *cont;
+ if (icont)
+ cont = icont;
+ else
+ cont = cms_content_bio(cms);
+ if (!cont)
+ {
+ CMSerr(CMS_F_CMS_DATAINIT, CMS_R_NO_CONTENT);
+ return NULL;
+ }
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ return cont;
+
+ case NID_pkcs7_signed:
+ cmsbio = cms_SignedData_init_bio(cms);
+ break;
+
+ case NID_pkcs7_digest:
+ cmsbio = cms_DigestedData_init_bio(cms);
+ break;
+#ifdef ZLIB
+ case NID_id_smime_ct_compressedData:
+ cmsbio = cms_CompressedData_init_bio(cms);
+ break;
+#endif
+
+ case NID_pkcs7_encrypted:
+ cmsbio = cms_EncryptedData_init_bio(cms);
+ break;
+
+ case NID_pkcs7_enveloped:
+ cmsbio = cms_EnvelopedData_init_bio(cms);
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_DATAINIT, CMS_R_UNSUPPORTED_TYPE);
+ return NULL;
+ }
+
+ if (cmsbio)
+ return BIO_push(cmsbio, cont);
+
+ if (!icont)
+ BIO_free(cont);
+ return NULL;
+
+ }
+
+int CMS_dataFinal(CMS_ContentInfo *cms, BIO *cmsbio)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+	/* If embedded content, find the memory BIO and set content */
+ if (*pos && ((*pos)->flags & ASN1_STRING_FLAG_CONT))
+ {
+ BIO *mbio;
+ unsigned char *cont;
+ long contlen;
+ mbio = BIO_find_type(cmsbio, BIO_TYPE_MEM);
+ if (!mbio)
+ {
+ CMSerr(CMS_F_CMS_DATAFINAL, CMS_R_CONTENT_NOT_FOUND);
+ return 0;
+ }
+ contlen = BIO_get_mem_data(mbio, &cont);
+ /* Set bio as read only so its content can't be clobbered */
+ BIO_set_flags(mbio, BIO_FLAGS_MEM_RDONLY);
+ BIO_set_mem_eof_return(mbio, 0);
+ ASN1_STRING_set0(*pos, cont, contlen);
+ (*pos)->flags &= ~ASN1_STRING_FLAG_CONT;
+ }
+
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ case NID_pkcs7_enveloped:
+ case NID_pkcs7_encrypted:
+ case NID_id_smime_ct_compressedData:
+ /* Nothing to do */
+ return 1;
+
+ case NID_pkcs7_signed:
+ return cms_SignedData_final(cms, cmsbio);
+
+ case NID_pkcs7_digest:
+ return cms_DigestedData_do_final(cms, cmsbio, 0);
+
+ default:
+ CMSerr(CMS_F_CMS_DATAFINAL, CMS_R_UNSUPPORTED_TYPE);
+ return 0;
+ }
+ }
+
+/* Return an OCTET STRING pointer to content. This allows it to
+ * be accessed or set later.
+ */
+
+ASN1_OCTET_STRING **CMS_get0_content(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ return &cms->d.data;
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->encapContentInfo->eContent;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->encryptedContentInfo->encryptedContent;
+
+ case NID_pkcs7_digest:
+ return &cms->d.digestedData->encapContentInfo->eContent;
+
+ case NID_pkcs7_encrypted:
+ return &cms->d.encryptedData->encryptedContentInfo->encryptedContent;
+
+ case NID_id_smime_ct_authData:
+ return &cms->d.authenticatedData->encapContentInfo->eContent;
+
+ case NID_id_smime_ct_compressedData:
+ return &cms->d.compressedData->encapContentInfo->eContent;
+
+ default:
+ if (cms->d.other->type == V_ASN1_OCTET_STRING)
+ return &cms->d.other->value.octet_string;
+ CMSerr(CMS_F_CMS_GET0_CONTENT, CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+/* Return an ASN1_OBJECT pointer to content type. This allows it to
+ * be accessed or set later.
+ */
+
+static ASN1_OBJECT **cms_get0_econtent_type(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->encapContentInfo->eContentType;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->encryptedContentInfo->contentType;
+
+ case NID_pkcs7_digest:
+ return &cms->d.digestedData->encapContentInfo->eContentType;
+
+ case NID_pkcs7_encrypted:
+ return &cms->d.encryptedData->encryptedContentInfo->contentType;
+
+ case NID_id_smime_ct_authData:
+ return &cms->d.authenticatedData->encapContentInfo->eContentType;
+
+ case NID_id_smime_ct_compressedData:
+ return &cms->d.compressedData->encapContentInfo->eContentType;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_ECONTENT_TYPE,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+const ASN1_OBJECT *CMS_get0_eContentType(CMS_ContentInfo *cms)
+ {
+ ASN1_OBJECT **petype;
+ petype = cms_get0_econtent_type(cms);
+ if (petype)
+ return *petype;
+ return NULL;
+ }
+
+int CMS_set1_eContentType(CMS_ContentInfo *cms, const ASN1_OBJECT *oid)
+ {
+ ASN1_OBJECT **petype, *etype;
+ petype = cms_get0_econtent_type(cms);
+ if (!petype)
+ return 0;
+ if (!oid)
+ return 1;
+ etype = OBJ_dup(oid);
+ if (!etype)
+ return 0;
+ ASN1_OBJECT_free(*petype);
+ *petype = etype;
+ return 1;
+ }
+
+int CMS_is_detached(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return -1;
+ if (*pos)
+ return 0;
+ return 1;
+ }
+
+int CMS_set_detached(CMS_ContentInfo *cms, int detached)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+ if (detached)
+ {
+ if (*pos)
+ {
+ ASN1_OCTET_STRING_free(*pos);
+ *pos = NULL;
+ }
+ return 1;
+ }
+ if (!*pos)
+ *pos = ASN1_OCTET_STRING_new();
+ if (*pos)
+ {
+ /* NB: special flag to show content is created and not
+ * read in.
+ */
+ (*pos)->flags |= ASN1_STRING_FLAG_CONT;
+ return 1;
+ }
+ CMSerr(CMS_F_CMS_SET_DETACHED, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void cms_DigestAlgorithm_set(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
+
+/* Create a digest BIO from an X509_ALGOR structure */
+
+BIO *cms_DigestAlgorithm_init_bio(X509_ALGOR *digestAlgorithm)
+ {
+ BIO *mdbio = NULL;
+ ASN1_OBJECT *digestoid;
+ const EVP_MD *digest;
+ X509_ALGOR_get0(&digestoid, NULL, NULL, digestAlgorithm);
+ digest = EVP_get_digestbyobj(digestoid);
+ if (!digest)
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO,
+ CMS_R_UNKNOWN_DIGEST_ALGORIHM);
+ goto err;
+ }
+ mdbio = BIO_new(BIO_f_md());
+ if (!mdbio || !BIO_set_md(mdbio, digest))
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO,
+ CMS_R_MD_BIO_INIT_ERROR);
+ goto err;
+ }
+ return mdbio;
+ err:
+ if (mdbio)
+ BIO_free(mdbio);
+ return NULL;
+ }
+
+/* Locate a message digest context in a BIO chain matching the given
+ * digest algorithm */
+
+int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
+ X509_ALGOR *mdalg)
+ {
+ int nid;
+ ASN1_OBJECT *mdoid;
+ X509_ALGOR_get0(&mdoid, NULL, NULL, mdalg);
+ nid = OBJ_obj2nid(mdoid);
+ /* Look for digest type to match signature */
+ for (;;)
+ {
+ EVP_MD_CTX *mtmp;
+ chain = BIO_find_type(chain, BIO_TYPE_MD);
+ if (chain == NULL)
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX,
+ CMS_R_NO_MATCHING_DIGEST);
+ return 0;
+ }
+ BIO_get_md_ctx(chain, &mtmp);
+ if (EVP_MD_CTX_type(mtmp) == nid
+ /* Workaround for broken implementations that use signature
+ * algorithm OID instead of digest.
+ */
+ || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid)
+ return EVP_MD_CTX_copy_ex(mctx, mtmp);
+ chain = BIO_next(chain);
+ }
+ }
+
+static STACK_OF(CMS_CertificateChoices) **cms_get0_certificate_choices(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->certificates;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->originatorInfo->certificates;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_CERTIFICATE_CHOICES,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+CMS_CertificateChoices *CMS_add0_CertificateChoices(CMS_ContentInfo *cms)
+ {
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ CMS_CertificateChoices *cch;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return NULL;
+ if (!*pcerts)
+ *pcerts = sk_CMS_CertificateChoices_new_null();
+ if (!*pcerts)
+ return NULL;
+ cch = M_ASN1_new_of(CMS_CertificateChoices);
+ if (!cch)
+ return NULL;
+ if (!sk_CMS_CertificateChoices_push(*pcerts, cch))
+ {
+ M_ASN1_free_of(cch, CMS_CertificateChoices);
+ return NULL;
+ }
+ return cch;
+ }
+
+int CMS_add0_cert(CMS_ContentInfo *cms, X509 *cert)
+ {
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ int i;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return 0;
+ for (i = 0; i < sk_CMS_CertificateChoices_num(*pcerts); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(*pcerts, i);
+ if (cch->type == CMS_CERTCHOICE_CERT)
+ {
+ if (!X509_cmp(cch->d.certificate, cert))
+ {
+ CMSerr(CMS_F_CMS_ADD0_CERT,
+ CMS_R_CERTIFICATE_ALREADY_PRESENT);
+ return 0;
+ }
+ }
+ }
+ cch = CMS_add0_CertificateChoices(cms);
+ if (!cch)
+ return 0;
+ cch->type = CMS_CERTCHOICE_CERT;
+ cch->d.certificate = cert;
+ return 1;
+ }
+
+int CMS_add1_cert(CMS_ContentInfo *cms, X509 *cert)
+ {
+ int r;
+ r = CMS_add0_cert(cms, cert);
+ if (r > 0)
+ CRYPTO_add(&cert->references, 1, CRYPTO_LOCK_X509);
+ return r;
+ }
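+
+/* Sketch for illustration of the add0/add1 ownership convention: add0
+ * transfers ownership of "cert" to the CMS structure, while add1 bumps
+ * the reference count so the caller keeps its own reference. Here the
+ * caller is assumed to be finished with cert after the call.
+ */
+static int example_keep_own_ref(CMS_ContentInfo *cms, X509 *cert)
+ {
+ if (!CMS_add1_cert(cms, cert))
+ return 0;
+ /* The caller still owns its reference and releases it here */
+ X509_free(cert);
+ return 1;
+ }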
+
+static STACK_OF(CMS_RevocationInfoChoice) **cms_get0_revocation_choices(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->crls;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->originatorInfo->crls;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_REVOCATION_CHOICES,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+CMS_RevocationInfoChoice *CMS_add0_RevocationInfoChoice(CMS_ContentInfo *cms)
+ {
+ STACK_OF(CMS_RevocationInfoChoice) **pcrls;
+ CMS_RevocationInfoChoice *rch;
+ pcrls = cms_get0_revocation_choices(cms);
+ if (!pcrls)
+ return NULL;
+ if (!*pcrls)
+ *pcrls = sk_CMS_RevocationInfoChoice_new_null();
+ if (!*pcrls)
+ return NULL;
+ rch = M_ASN1_new_of(CMS_RevocationInfoChoice);
+ if (!rch)
+ return NULL;
+ if (!sk_CMS_RevocationInfoChoice_push(*pcrls, rch))
+ {
+ M_ASN1_free_of(rch, CMS_RevocationInfoChoice);
+ return NULL;
+ }
+ return rch;
+ }
+
+int CMS_add0_crl(CMS_ContentInfo *cms, X509_CRL *crl)
+ {
+ CMS_RevocationInfoChoice *rch;
+ rch = CMS_add0_RevocationInfoChoice(cms);
+ if (!rch)
+ return 0;
+ rch->type = CMS_REVCHOICE_CRL;
+ rch->d.crl = crl;
+ return 1;
+ }
+
+int CMS_add1_crl(CMS_ContentInfo *cms, X509_CRL *crl)
+ {
+ int r;
+ r = CMS_add0_crl(cms, crl);
+ if (r > 0)
+ CRYPTO_add(&crl->references, 1, CRYPTO_LOCK_X509_CRL);
+ return r;
+ }
+
+STACK_OF(X509) *CMS_get1_certs(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509) *certs = NULL;
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ int i;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return NULL;
+ for (i = 0; i < sk_CMS_CertificateChoices_num(*pcerts); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(*pcerts, i);
+ if (cch->type == 0)
+ {
+ if (!certs)
+ {
+ certs = sk_X509_new_null();
+ if (!certs)
+ return NULL;
+ }
+ if (!sk_X509_push(certs, cch->d.certificate))
+ {
+ sk_X509_pop_free(certs, X509_free);
+ return NULL;
+ }
+ CRYPTO_add(&cch->d.certificate->references,
+ 1, CRYPTO_LOCK_X509);
+ }
+ }
+ return certs;
+
+ }
+
+STACK_OF(X509_CRL) *CMS_get1_crls(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509_CRL) *crls = NULL;
+ STACK_OF(CMS_RevocationInfoChoice) **pcrls;
+ CMS_RevocationInfoChoice *rch;
+ int i;
+ pcrls = cms_get0_revocation_choices(cms);
+ if (!pcrls)
+ return NULL;
+ for (i = 0; i < sk_CMS_RevocationInfoChoice_num(*pcrls); i++)
+ {
+ rch = sk_CMS_RevocationInfoChoice_value(*pcrls, i);
+ if (rch->type == 0)
+ {
+ if (!crls)
+ {
+ crls = sk_X509_CRL_new_null();
+ if (!crls)
+ return NULL;
+ }
+ if (!sk_X509_CRL_push(crls, rch->d.crl))
+ {
+ sk_X509_CRL_pop_free(crls, X509_CRL_free);
+ return NULL;
+ }
+ CRYPTO_add(&rch->d.crl->references,
+ 1, CRYPTO_LOCK_X509_CRL);
+ }
+ }
+ return crls;
+ }
diff --git a/main/openssl/crypto/cms/cms_pwri.c b/main/openssl/crypto/cms/cms_pwri.c
new file mode 100644
index 00000000..b79612a1
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_pwri.c
@@ -0,0 +1,454 @@
+/* crypto/cms/cms_pwri.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2009 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri,
+ unsigned char *pass, ossl_ssize_t passlen)
+ {
+ CMS_PasswordRecipientInfo *pwri;
+ if (ri->type != CMS_RECIPINFO_PASS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI);
+ return 0;
+ }
+
+ pwri = ri->d.pwri;
+ pwri->pass = pass;
+ if (pass && passlen < 0)
+ passlen = strlen((char *)pass);
+ pwri->passlen = passlen;
+ return 1;
+ }
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+ int iter, int wrap_nid, int pbe_nid,
+ unsigned char *pass,
+ ossl_ssize_t passlen,
+ const EVP_CIPHER *kekciph)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_EnvelopedData *env;
+ CMS_PasswordRecipientInfo *pwri;
+ EVP_CIPHER_CTX ctx;
+ X509_ALGOR *encalg = NULL;
+ unsigned char iv[EVP_MAX_IV_LENGTH];
+ int ivlen;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ if (wrap_nid <= 0)
+ wrap_nid = NID_id_alg_PWRI_KEK;
+
+ if (pbe_nid <= 0)
+ pbe_nid = NID_id_pbkdf2;
+
+ /* Get from enveloped data */
+ if (kekciph == NULL)
+ kekciph = env->encryptedContentInfo->cipher;
+
+ if (kekciph == NULL)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER);
+ return NULL;
+ }
+ if (wrap_nid != NID_id_alg_PWRI_KEK)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+ return NULL;
+ }
+
+ /* Setup algorithm identifier for cipher */
+ encalg = X509_ALGOR_new();
+ EVP_CIPHER_CTX_init(&ctx);
+
+ if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB);
+ goto err;
+ }
+
+ ivlen = EVP_CIPHER_CTX_iv_length(&ctx);
+
+ if (ivlen > 0)
+ {
+ if (RAND_pseudo_bytes(iv, ivlen) <= 0)
+ goto err;
+ if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ ERR_R_EVP_LIB);
+ goto err;
+ }
+ encalg->parameter = ASN1_TYPE_new();
+ if (!encalg->parameter)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ }
+
+
+ encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx));
+
+ EVP_CIPHER_CTX_cleanup(&ctx);
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo);
+ if (!ri->d.pwri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_PASS;
+
+ pwri = ri->d.pwri;
+ /* Since this is overwritten, free up empty structure already there */
+ X509_ALGOR_free(pwri->keyEncryptionAlgorithm);
+ pwri->keyEncryptionAlgorithm = X509_ALGOR_new();
+ if (!pwri->keyEncryptionAlgorithm)
+ goto merr;
+ pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid);
+ pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new();
+ if (!pwri->keyEncryptionAlgorithm->parameter)
+ goto merr;
+
+ if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR),
+ &pwri->keyEncryptionAlgorithm->parameter->value.sequence))
+ goto merr;
+ pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE;
+
+ X509_ALGOR_free(encalg);
+ encalg = NULL;
+
+ /* Setup PBE algorithm */
+
+ pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1);
+
+ if (!pwri->keyDerivationAlgorithm)
+ goto err;
+
+ CMS_RecipientInfo_set0_password(ri, pass, passlen);
+ pwri->version = 0;
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE);
+ err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ if (encalg)
+ X509_ALGOR_free(encalg);
+ return NULL;
+
+ }
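+
+/* Sketch for illustration: attaching a password recipient to an
+ * enveloped-data structure created with CMS_PARTIAL. The -1 arguments
+ * select the PWRI-KEK and PBKDF2 defaults; the iteration count of 10000
+ * is an arbitrary illustrative choice. Note the set0 semantics: the
+ * password buffer is handed over to the structure.
+ */
+static CMS_RecipientInfo *example_add_pwri(CMS_ContentInfo *cms,
+ unsigned char *pass)
+ {
+ return CMS_add0_recipient_password(cms, 10000, -1, -1,
+ pass, -1, NULL);
+ }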
+
+/* This is an implementation of the key wrapping mechanism in RFC 3211;
+ * at some point this should go into EVP.
+ */
+
+static int kek_unwrap_key(unsigned char *out, size_t *outlen,
+ const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+ {
+ size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+ unsigned char *tmp;
+ int outl, rv = 0;
+ if (inlen < 2 * blocklen)
+ {
+ /* too small */
+ return 0;
+ }
+ if (inlen % blocklen)
+ {
+ /* Invalid size */
+ return 0;
+ }
+ tmp = OPENSSL_malloc(inlen);
+ if (!tmp)
+ return 0;
+ /* setup IV by decrypting last two blocks */
+ EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl,
+ in + inlen - 2 * blocklen, blocklen * 2);
+ /* Do a decrypt of the last decrypted block to set the IV to the
+ * correct value; output it to the start of the buffer so we don't
+ * corrupt the decrypted block. This works because the buffer is at
+ * least two block lengths long.
+ */
+ EVP_DecryptUpdate(ctx, tmp, &outl,
+ tmp + inlen - blocklen, blocklen);
+ /* Can now decrypt first n - 1 blocks */
+ EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen);
+
+ /* Reset IV to original value */
+ EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL);
+ /* Decrypt again */
+ EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen);
+ /* Verify the check bytes */
+ if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff)
+ {
+ /* Check byte failure */
+ goto err;
+ }
+ if (inlen < (size_t)(tmp[0] - 4))
+ {
+ /* Invalid length value */
+ goto err;
+ }
+ *outlen = (size_t)tmp[0];
+ memcpy(out, tmp + 4, *outlen);
+ rv = 1;
+ err:
+ OPENSSL_cleanse(tmp, inlen);
+ OPENSSL_free(tmp);
+ return rv;
+
+ }
+
+static int kek_wrap_key(unsigned char *out, size_t *outlen,
+ const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+ {
+ size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+ size_t olen;
+ int dummy;
+ /* First decide length of output buffer: need header and round up to
+ * multiple of block length.
+ */
+ olen = (inlen + 4 + blocklen - 1)/blocklen;
+ olen *= blocklen;
+ if (olen < 2 * blocklen)
+ {
+ /* Key too small */
+ return 0;
+ }
+ if (inlen > 0xFF)
+ {
+ /* Key too large */
+ return 0;
+ }
+ if (out)
+ {
+ /* Set header */
+ out[0] = (unsigned char)inlen;
+ out[1] = in[0] ^ 0xFF;
+ out[2] = in[1] ^ 0xFF;
+ out[3] = in[2] ^ 0xFF;
+ memcpy(out + 4, in, inlen);
+ /* Add random padding to end */
+ if (olen > inlen + 4)
+ RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen);
+ /* Encrypt twice */
+ EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+ EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+ }
+
+ *outlen = olen;
+
+ return 1;
+ }
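+
+/* For illustration, the formatted block built above for a 16 byte key
+ * and a 16 byte block cipher looks like:
+ *
+ * byte 0 key length (0x10)
+ * bytes 1-3 check bytes: complement of key[0], key[1], key[2]
+ * bytes 4-19 the key itself
+ * bytes 20-31 random padding up to the next block boundary
+ *
+ * The buffer is then CBC encrypted twice so that every output block
+ * depends on every input block, as required by RFC 3211.
+ */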
+
+/* Encrypt/Decrypt content key in PWRI recipient info */
+
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+ int en_de)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_PasswordRecipientInfo *pwri;
+ const unsigned char *p = NULL;
+ int plen;
+ int r = 0;
+ X509_ALGOR *algtmp, *kekalg = NULL;
+ EVP_CIPHER_CTX kekctx;
+ const EVP_CIPHER *kekcipher;
+ unsigned char *key = NULL;
+ size_t keylen;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ pwri = ri->d.pwri;
+ EVP_CIPHER_CTX_init(&kekctx);
+
+ if (!pwri->pass)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD);
+ return 0;
+ }
+ algtmp = pwri->keyEncryptionAlgorithm;
+
+ if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+ return 0;
+ }
+
+ if (algtmp->parameter->type == V_ASN1_SEQUENCE)
+ {
+ p = algtmp->parameter->value.sequence->data;
+ plen = algtmp->parameter->value.sequence->length;
+ kekalg = d2i_X509_ALGOR(NULL, &p, plen);
+ }
+ if (kekalg == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER);
+ return 0;
+ }
+
+ kekcipher = EVP_get_cipherbyobj(kekalg->algorithm);
+
+ if(!kekcipher)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNKNOWN_CIPHER);
+ goto err;
+ }
+
+ /* Fixup cipher based on AlgorithmIdentifier to set IV etc */
+ if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de))
+ goto err;
+ EVP_CIPHER_CTX_set_padding(&kekctx, 0);
+ if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ algtmp = pwri->keyDerivationAlgorithm;
+
+ /* Finish password based key derivation to setup key in "ctx" */
+
+ if (EVP_PBE_CipherInit(algtmp->algorithm,
+ (char *)pwri->pass, pwri->passlen,
+ algtmp->parameter, &kekctx, en_de) < 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB);
+ goto err;
+ }
+
+ /* Finally wrap/unwrap the key */
+
+ if (en_de)
+ {
+
+ if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx))
+ goto err;
+
+ key = OPENSSL_malloc(keylen);
+
+ if (!key)
+ goto err;
+
+ if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx))
+ goto err;
+ pwri->encryptedKey->data = key;
+ pwri->encryptedKey->length = keylen;
+ }
+ else
+ {
+ key = OPENSSL_malloc(pwri->encryptedKey->length);
+
+ if (!key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!kek_unwrap_key(key, &keylen,
+ pwri->encryptedKey->data,
+ pwri->encryptedKey->length, &kekctx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNWRAP_FAILURE);
+ goto err;
+ }
+
+ ec->key = key;
+ ec->keylen = keylen;
+
+ }
+
+ r = 1;
+
+ err:
+
+ EVP_CIPHER_CTX_cleanup(&kekctx);
+
+ if (!r && key)
+ OPENSSL_free(key);
+ X509_ALGOR_free(kekalg);
+
+ return r;
+
+ }
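+
+/* Sketch for illustration: the public entry point that drives the code
+ * above is CMS_decrypt_set1_password() (see cms_smime.c). A typical
+ * decrypt of a password-protected S/MIME message:
+ */
+static int example_pwri_decrypt(BIO *in, BIO *out, unsigned char *pass)
+ {
+ int r = 0;
+ CMS_ContentInfo *cms = SMIME_read_CMS(in, NULL);
+ if (!cms)
+ return 0;
+ if (CMS_decrypt_set1_password(cms, pass, -1)
+ && CMS_decrypt(cms, NULL, NULL, NULL, out, 0))
+ r = 1;
+ CMS_ContentInfo_free(cms);
+ return r;
+ }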
diff --git a/main/openssl/crypto/cms/cms_sd.c b/main/openssl/crypto/cms/cms_sd.c
new file mode 100644
index 00000000..77fbd135
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_sd.c
@@ -0,0 +1,985 @@
+/* crypto/cms/cms_sd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+/* CMS SignedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_SignedData)
+
+static CMS_SignedData *cms_get0_signed(CMS_ContentInfo *cms)
+ {
+ if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_signed)
+ {
+ CMSerr(CMS_F_CMS_GET0_SIGNED, CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA);
+ return NULL;
+ }
+ return cms->d.signedData;
+ }
+
+static CMS_SignedData *cms_signed_data_init(CMS_ContentInfo *cms)
+ {
+ if (cms->d.other == NULL)
+ {
+ cms->d.signedData = M_ASN1_new_of(CMS_SignedData);
+ if (!cms->d.signedData)
+ {
+ CMSerr(CMS_F_CMS_SIGNED_DATA_INIT, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ cms->d.signedData->version = 1;
+ cms->d.signedData->encapContentInfo->eContentType =
+ OBJ_nid2obj(NID_pkcs7_data);
+ cms->d.signedData->encapContentInfo->partial = 1;
+ ASN1_OBJECT_free(cms->contentType);
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_signed);
+ return cms->d.signedData;
+ }
+ return cms_get0_signed(cms);
+ }
+
+/* Just initialize SignedData e.g. for certs only structure */
+
+int CMS_SignedData_init(CMS_ContentInfo *cms)
+ {
+ if (cms_signed_data_init(cms))
+ return 1;
+ else
+ return 0;
+ }
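+
+/* Sketch for illustration: a certs-only message uses exactly this entry
+ * point, an empty SignedData carrying certificates but no signers.
+ * Finalisation and output (e.g. SMIME_write_CMS) are omitted here.
+ */
+static CMS_ContentInfo *example_certs_only(STACK_OF(X509) *certs)
+ {
+ int i;
+ CMS_ContentInfo *cms = CMS_ContentInfo_new();
+ if (!cms || !CMS_SignedData_init(cms))
+ goto err;
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ if (!CMS_add1_cert(cms, sk_X509_value(certs, i)))
+ goto err;
+ }
+ return cms;
+ err:
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }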
+
+/* Check structures and fixup version numbers (if necessary) */
+
+static void cms_sd_set_version(CMS_SignedData *sd)
+ {
+ int i;
+ CMS_CertificateChoices *cch;
+ CMS_RevocationInfoChoice *rch;
+ CMS_SignerInfo *si;
+
+ for (i = 0; i < sk_CMS_CertificateChoices_num(sd->certificates); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(sd->certificates, i);
+ if (cch->type == CMS_CERTCHOICE_OTHER)
+ {
+ if (sd->version < 5)
+ sd->version = 5;
+ }
+ else if (cch->type == CMS_CERTCHOICE_V2ACERT)
+ {
+ if (sd->version < 4)
+ sd->version = 4;
+ }
+ else if (cch->type == CMS_CERTCHOICE_V1ACERT)
+ {
+ if (sd->version < 3)
+ sd->version = 3;
+ }
+ }
+
+ for (i = 0; i < sk_CMS_RevocationInfoChoice_num(sd->crls); i++)
+ {
+ rch = sk_CMS_RevocationInfoChoice_value(sd->crls, i);
+ if (rch->type == CMS_REVCHOICE_OTHER)
+ {
+ if (sd->version < 5)
+ sd->version = 5;
+ }
+ }
+
+ if ((OBJ_obj2nid(sd->encapContentInfo->eContentType) != NID_pkcs7_data)
+ && (sd->version < 3))
+ sd->version = 3;
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(sd->signerInfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sd->signerInfos, i);
+ if (si->sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ if (si->version < 3)
+ si->version = 3;
+ if (sd->version < 3)
+ sd->version = 3;
+ }
+ else
+ sd->version = 1;
+ }
+
+ if (sd->version < 1)
+ sd->version = 1;
+
+ }
+
+/* Copy an existing messageDigest value */
+
+static int cms_copy_messageDigest(CMS_ContentInfo *cms, CMS_SignerInfo *si)
+ {
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *sitmp;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ ASN1_OCTET_STRING *messageDigest;
+ sitmp = sk_CMS_SignerInfo_value(sinfos, i);
+ if (sitmp == si)
+ continue;
+ if (CMS_signed_get_attr_count(sitmp) < 0)
+ continue;
+ if (OBJ_cmp(si->digestAlgorithm->algorithm,
+ sitmp->digestAlgorithm->algorithm))
+ continue;
+ messageDigest = CMS_signed_get0_data_by_OBJ(sitmp,
+ OBJ_nid2obj(NID_pkcs9_messageDigest),
+ -3, V_ASN1_OCTET_STRING);
+ if (!messageDigest)
+ {
+ CMSerr(CMS_F_CMS_COPY_MESSAGEDIGEST,
+ CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE);
+ return 0;
+ }
+
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest,
+ V_ASN1_OCTET_STRING,
+ messageDigest, -1))
+ return 1;
+ else
+ return 0;
+ }
+ CMSerr(CMS_F_CMS_COPY_MESSAGEDIGEST, CMS_R_NO_MATCHING_DIGEST);
+ return 0;
+ }
+
+int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type)
+ {
+ switch(type)
+ {
+ case CMS_SIGNERINFO_ISSUER_SERIAL:
+ sid->d.issuerAndSerialNumber =
+ M_ASN1_new_of(CMS_IssuerAndSerialNumber);
+ if (!sid->d.issuerAndSerialNumber)
+ goto merr;
+ if (!X509_NAME_set(&sid->d.issuerAndSerialNumber->issuer,
+ X509_get_issuer_name(cert)))
+ goto merr;
+ if (!ASN1_STRING_copy(
+ sid->d.issuerAndSerialNumber->serialNumber,
+ X509_get_serialNumber(cert)))
+ goto merr;
+ break;
+
+ case CMS_SIGNERINFO_KEYIDENTIFIER:
+ if (!cert->skid)
+ {
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER,
+ CMS_R_CERTIFICATE_HAS_NO_KEYID);
+ return 0;
+ }
+ sid->d.subjectKeyIdentifier = ASN1_STRING_dup(cert->skid);
+ if (!sid->d.subjectKeyIdentifier)
+ goto merr;
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER, CMS_R_UNKNOWN_ID);
+ return 0;
+ }
+
+ sid->type = type;
+
+ return 1;
+
+ merr:
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER, ERR_R_MALLOC_FAILURE);
+ return 0;
+
+ }
+
+int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL)
+ {
+ if (issuer)
+ *issuer = sid->d.issuerAndSerialNumber->issuer;
+ if (sno)
+ *sno = sid->d.issuerAndSerialNumber->serialNumber;
+ }
+ else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ if (keyid)
+ *keyid = sid->d.subjectKeyIdentifier;
+ }
+ else
+ return 0;
+ return 1;
+ }
+
+int cms_SignerIdentifier_cert_cmp(CMS_SignerIdentifier *sid, X509 *cert)
+ {
+ int ret;
+ if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL)
+ {
+ ret = X509_NAME_cmp(sid->d.issuerAndSerialNumber->issuer,
+ X509_get_issuer_name(cert));
+ if (ret)
+ return ret;
+ return ASN1_INTEGER_cmp(sid->d.issuerAndSerialNumber->serialNumber,
+ X509_get_serialNumber(cert));
+ }
+ else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ X509_check_purpose(cert, -1, -1);
+ if (!cert->skid)
+ return -1;
+ return ASN1_OCTET_STRING_cmp(sid->d.subjectKeyIdentifier,
+ cert->skid);
+ }
+ else
+ return -1;
+ }
+
+CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
+ X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
+ unsigned int flags)
+ {
+ CMS_SignedData *sd;
+ CMS_SignerInfo *si = NULL;
+ X509_ALGOR *alg;
+ int i, type;
+ if(!X509_check_private_key(signer, pk))
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER,
+ CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE);
+ return NULL;
+ }
+ sd = cms_signed_data_init(cms);
+ if (!sd)
+ goto err;
+ si = M_ASN1_new_of(CMS_SignerInfo);
+ if (!si)
+ goto merr;
+ X509_check_purpose(signer, -1, -1);
+
+ CRYPTO_add(&pk->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ CRYPTO_add(&signer->references, 1, CRYPTO_LOCK_X509);
+
+ si->pkey = pk;
+ si->signer = signer;
+
+ if (flags & CMS_USE_KEYID)
+ {
+ si->version = 3;
+ if (sd->version < 3)
+ sd->version = 3;
+ type = CMS_SIGNERINFO_KEYIDENTIFIER;
+ }
+ else
+ {
+ type = CMS_SIGNERINFO_ISSUER_SERIAL;
+ si->version = 1;
+ }
+
+ if (!cms_set1_SignerIdentifier(si->sid, signer, type))
+ goto err;
+
+ if (md == NULL)
+ {
+ int def_nid;
+ if (EVP_PKEY_get_default_digest_nid(pk, &def_nid) <= 0)
+ goto err;
+ md = EVP_get_digestbynid(def_nid);
+ if (md == NULL)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_NO_DEFAULT_DIGEST);
+ goto err;
+ }
+ }
+
+ if (!md)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_NO_DIGEST_SET);
+ goto err;
+ }
+
+ cms_DigestAlgorithm_set(si->digestAlgorithm, md);
+
+ /* See if digest is present in digestAlgorithms */
+ for (i = 0; i < sk_X509_ALGOR_num(sd->digestAlgorithms); i++)
+ {
+ ASN1_OBJECT *aoid;
+ alg = sk_X509_ALGOR_value(sd->digestAlgorithms, i);
+ X509_ALGOR_get0(&aoid, NULL, NULL, alg);
+ if (OBJ_obj2nid(aoid) == EVP_MD_type(md))
+ break;
+ }
+
+ if (i == sk_X509_ALGOR_num(sd->digestAlgorithms))
+ {
+ alg = X509_ALGOR_new();
+ if (!alg)
+ goto merr;
+ cms_DigestAlgorithm_set(alg, md);
+ if (!sk_X509_ALGOR_push(sd->digestAlgorithms, alg))
+ {
+ X509_ALGOR_free(alg);
+ goto merr;
+ }
+ }
+
+ if (pk->ameth && pk->ameth->pkey_ctrl)
+ {
+ i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_SIGN,
+ 0, si);
+ if (i == -2)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ goto err;
+ }
+ if (i <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_CTRL_FAILURE);
+ goto err;
+ }
+ }
+
+ if (!(flags & CMS_NOATTR))
+ {
+ /* Initialize the signed attributes structure so that other
+ * attributes, such as signing time, can be added later even
+ * if we add none here.
+ */
+ if (!si->signedAttrs)
+ {
+ si->signedAttrs = sk_X509_ATTRIBUTE_new_null();
+ if (!si->signedAttrs)
+ goto merr;
+ }
+
+ if (!(flags & CMS_NOSMIMECAP))
+ {
+ STACK_OF(X509_ALGOR) *smcap = NULL;
+ i = CMS_add_standard_smimecap(&smcap);
+ if (i)
+ i = CMS_add_smimecap(si, smcap);
+ sk_X509_ALGOR_pop_free(smcap, X509_ALGOR_free);
+ if (!i)
+ goto merr;
+ }
+ if (flags & CMS_REUSE_DIGEST)
+ {
+ if (!cms_copy_messageDigest(cms, si))
+ goto err;
+ if (!(flags & CMS_PARTIAL) &&
+ !CMS_SignerInfo_sign(si))
+ goto err;
+ }
+ }
+
+ if (!(flags & CMS_NOCERTS))
+ {
+ /* NB ignore -1 return for duplicate cert */
+ if (!CMS_add1_cert(cms, signer))
+ goto merr;
+ }
+
+ if (!sd->signerInfos)
+ sd->signerInfos = sk_CMS_SignerInfo_new_null();
+ if (!sd->signerInfos ||
+ !sk_CMS_SignerInfo_push(sd->signerInfos, si))
+ goto merr;
+
+ return si;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, ERR_R_MALLOC_FAILURE);
+ err:
+ if (si)
+ M_ASN1_free_of(si, CMS_SignerInfo);
+ return NULL;
+
+ }
+
+static int cms_add1_signingTime(CMS_SignerInfo *si, ASN1_TIME *t)
+ {
+ ASN1_TIME *tt;
+ int r = 0;
+ if (t)
+ tt = t;
+ else
+ tt = X509_gmtime_adj(NULL, 0);
+
+ if (!tt)
+ goto merr;
+
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_signingTime,
+ tt->type, tt, -1) <= 0)
+ goto merr;
+
+ r = 1;
+
+ merr:
+
+ if (!t)
+ ASN1_TIME_free(tt);
+
+ if (!r)
+ CMSerr(CMS_F_CMS_ADD1_SIGNINGTIME, ERR_R_MALLOC_FAILURE);
+
+ return r;
+
+ }
+
+STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms)
+ {
+ CMS_SignedData *sd;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return NULL;
+ return sd->signerInfos;
+ }
+
+STACK_OF(X509) *CMS_get0_signers(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509) *signers = NULL;
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *si;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (si->signer)
+ {
+ if (!signers)
+ {
+ signers = sk_X509_new_null();
+ if (!signers)
+ return NULL;
+ }
+ if (!sk_X509_push(signers, si->signer))
+ {
+ sk_X509_free(signers);
+ return NULL;
+ }
+ }
+ }
+ return signers;
+ }
+
+void CMS_SignerInfo_set1_signer_cert(CMS_SignerInfo *si, X509 *signer)
+ {
+ if (signer)
+ {
+ CRYPTO_add(&signer->references, 1, CRYPTO_LOCK_X509);
+ if (si->pkey)
+ EVP_PKEY_free(si->pkey);
+ si->pkey = X509_get_pubkey(signer);
+ }
+ if (si->signer)
+ X509_free(si->signer);
+ si->signer = signer;
+ }
+
+int CMS_SignerInfo_get0_signer_id(CMS_SignerInfo *si,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ return cms_SignerIdentifier_get0_signer_id(si->sid, keyid, issuer, sno);
+ }
+
+int CMS_SignerInfo_cert_cmp(CMS_SignerInfo *si, X509 *cert)
+ {
+ return cms_SignerIdentifier_cert_cmp(si->sid, cert);
+ }
+
+int CMS_set1_signers_certs(CMS_ContentInfo *cms, STACK_OF(X509) *scerts,
+ unsigned int flags)
+ {
+ CMS_SignedData *sd;
+ CMS_SignerInfo *si;
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) *certs;
+ X509 *x;
+ int i, j;
+ int ret = 0;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return -1;
+ certs = sd->certificates;
+ for (i = 0; i < sk_CMS_SignerInfo_num(sd->signerInfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sd->signerInfos, i);
+ if (si->signer)
+ continue;
+
+ for (j = 0; j < sk_X509_num(scerts); j++)
+ {
+ x = sk_X509_value(scerts, j);
+ if (CMS_SignerInfo_cert_cmp(si, x) == 0)
+ {
+ CMS_SignerInfo_set1_signer_cert(si, x);
+ ret++;
+ break;
+ }
+ }
+
+ if (si->signer || (flags & CMS_NOINTERN))
+ continue;
+
+ for (j = 0; j < sk_CMS_CertificateChoices_num(certs); j++)
+ {
+ cch = sk_CMS_CertificateChoices_value(certs, j);
+ if (cch->type != 0)
+ continue;
+ x = cch->d.certificate;
+ if (CMS_SignerInfo_cert_cmp(si, x) == 0)
+ {
+ CMS_SignerInfo_set1_signer_cert(si, x);
+ ret++;
+ break;
+ }
+ }
+ }
+ return ret;
+ }
+
+void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk, X509 **signer,
+ X509_ALGOR **pdig, X509_ALGOR **psig)
+ {
+ if (pk)
+ *pk = si->pkey;
+ if (signer)
+ *signer = si->signer;
+ if (pdig)
+ *pdig = si->digestAlgorithm;
+ if (psig)
+ *psig = si->signatureAlgorithm;
+ }
+
+static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
+ CMS_SignerInfo *si, BIO *chain)
+ {
+ EVP_MD_CTX mctx;
+ int r = 0;
+ EVP_MD_CTX_init(&mctx);
+
+
+ if (!si->pkey)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN, CMS_R_NO_PRIVATE_KEY);
+ return 0;
+ }
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
+ goto err;
+
+ /* If any signed attributes calculate and add messageDigest attribute */
+
+ if (CMS_signed_get_attr_count(si) >= 0)
+ {
+ ASN1_OBJECT *ctype =
+ cms->d.signedData->encapContentInfo->eContentType;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ unsigned int mdlen;
+ if (!EVP_DigestFinal_ex(&mctx, md, &mdlen))
+ goto err;
+ if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest,
+ V_ASN1_OCTET_STRING,
+ md, mdlen))
+ goto err;
+ /* Copy content type across */
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_contentType,
+ V_ASN1_OBJECT, ctype, -1) <= 0)
+ goto err;
+ if (!CMS_SignerInfo_sign(si))
+ goto err;
+ }
+ else
+ {
+ unsigned char *sig;
+ unsigned int siglen;
+ sig = OPENSSL_malloc(EVP_PKEY_size(si->pkey));
+ if (!sig)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!EVP_SignFinal(&mctx, sig, &siglen, si->pkey))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN,
+ CMS_R_SIGNFINAL_ERROR);
+ OPENSSL_free(sig);
+ goto err;
+ }
+ ASN1_STRING_set0(si->signature, sig, siglen);
+ }
+
+ r = 1;
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+
+ }
+
+int cms_SignedData_final(CMS_ContentInfo *cms, BIO *chain)
+ {
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *si;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (!cms_SignerInfo_content_sign(cms, si, chain))
+ return 0;
+ }
+ cms->d.signedData->encapContentInfo->partial = 0;
+ return 1;
+ }
+
+int CMS_SignerInfo_sign(CMS_SignerInfo *si)
+ {
+ EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pctx;
+ unsigned char *abuf = NULL;
+ int alen;
+ size_t siglen;
+ const EVP_MD *md = NULL;
+
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return 0;
+
+ EVP_MD_CTX_init(&mctx);
+
+ if (CMS_signed_get_attr_by_NID(si, NID_pkcs9_signingTime, -1) < 0)
+ {
+ if (!cms_add1_signingTime(si, NULL))
+ goto err;
+ }
+
+ if (EVP_DigestSignInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
+ EVP_PKEY_CTRL_CMS_SIGN, 0, si) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_SIGN, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ alen = ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs,&abuf,
+ ASN1_ITEM_rptr(CMS_Attributes_Sign));
+ if(!abuf)
+ goto err;
+ if (EVP_DigestSignUpdate(&mctx, abuf, alen) <= 0)
+ goto err;
+ if (EVP_DigestSignFinal(&mctx, NULL, &siglen) <= 0)
+ goto err;
+ OPENSSL_free(abuf);
+ abuf = OPENSSL_malloc(siglen);
+ if(!abuf)
+ goto err;
+ if (EVP_DigestSignFinal(&mctx, abuf, &siglen) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
+ EVP_PKEY_CTRL_CMS_SIGN, 1, si) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_SIGN, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ EVP_MD_CTX_cleanup(&mctx);
+
+ ASN1_STRING_set0(si->signature, abuf, siglen);
+
+ return 1;
+
+ err:
+ if (abuf)
+ OPENSSL_free(abuf);
+ EVP_MD_CTX_cleanup(&mctx);
+ return 0;
+
+ }
+
+int CMS_SignerInfo_verify(CMS_SignerInfo *si)
+ {
+ EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pctx;
+ unsigned char *abuf = NULL;
+ int alen, r = -1;
+ const EVP_MD *md = NULL;
+
+ if (!si->pkey)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY, CMS_R_NO_PUBLIC_KEY);
+ return -1;
+ }
+
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return -1;
+ EVP_MD_CTX_init(&mctx);
+ if (EVP_DigestVerifyInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+
+ alen = ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs,&abuf,
+ ASN1_ITEM_rptr(CMS_Attributes_Verify));
+ if(!abuf)
+ goto err;
+ r = EVP_DigestVerifyUpdate(&mctx, abuf, alen);
+ OPENSSL_free(abuf);
+ if (r <= 0)
+ {
+ r = -1;
+ goto err;
+ }
+ r = EVP_DigestVerifyFinal(&mctx,
+ si->signature->data, si->signature->length);
+ if (r <= 0)
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY, CMS_R_VERIFICATION_FAILURE);
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+ }
+
+/* Create a chain of digest BIOs from a CMS ContentInfo */
+
+BIO *cms_SignedData_init_bio(CMS_ContentInfo *cms)
+ {
+ int i;
+ CMS_SignedData *sd;
+ BIO *chain = NULL;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return NULL;
+ if (cms->d.signedData->encapContentInfo->partial)
+ cms_sd_set_version(sd);
+ for (i = 0; i < sk_X509_ALGOR_num(sd->digestAlgorithms); i++)
+ {
+ X509_ALGOR *digestAlgorithm;
+ BIO *mdbio;
+ digestAlgorithm = sk_X509_ALGOR_value(sd->digestAlgorithms, i);
+ mdbio = cms_DigestAlgorithm_init_bio(digestAlgorithm);
+ if (!mdbio)
+ goto err;
+ if (chain)
+ BIO_push(chain, mdbio);
+ else
+ chain = mdbio;
+ }
+ return chain;
+ err:
+ if (chain)
+ BIO_free_all(chain);
+ return NULL;
+ }
+
+int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
+ {
+ ASN1_OCTET_STRING *os = NULL;
+ EVP_MD_CTX mctx;
+ int r = -1;
+ EVP_MD_CTX_init(&mctx);
+ /* If we have any signed attributes look for messageDigest value */
+ if (CMS_signed_get_attr_count(si) >= 0)
+ {
+ os = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_pkcs9_messageDigest),
+ -3, V_ASN1_OCTET_STRING);
+ if (!os)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE);
+ goto err;
+ }
+ }
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
+ goto err;
+
+ /* If messageDigest found compare it */
+
+ if (os)
+ {
+ unsigned char mval[EVP_MAX_MD_SIZE];
+ unsigned int mlen;
+ if (EVP_DigestFinal_ex(&mctx, mval, &mlen) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_UNABLE_TO_FINALIZE_CONTEXT);
+ goto err;
+ }
+ if (mlen != (unsigned int)os->length)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(mval, os->data, mlen))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_VERIFICATION_FAILURE);
+ r = 0;
+ }
+ else
+ r = 1;
+ }
+ else
+ {
+ r = EVP_VerifyFinal(&mctx, si->signature->data,
+ si->signature->length, si->pkey);
+ if (r <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_VERIFICATION_FAILURE);
+ r = 0;
+ }
+ }
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+
+ }
+
+int CMS_add_smimecap(CMS_SignerInfo *si, STACK_OF(X509_ALGOR) *algs)
+ {
+ unsigned char *smder = NULL;
+ int smderlen, r;
+ smderlen = i2d_X509_ALGORS(algs, &smder);
+ if (smderlen <= 0)
+ return 0;
+ r = CMS_signed_add1_attr_by_NID(si, NID_SMIMECapabilities,
+ V_ASN1_SEQUENCE, smder, smderlen);
+ OPENSSL_free(smder);
+ return r;
+ }
+
+int CMS_add_simple_smimecap(STACK_OF(X509_ALGOR) **algs,
+ int algnid, int keysize)
+ {
+ X509_ALGOR *alg;
+ ASN1_INTEGER *key = NULL;
+ if (keysize > 0)
+ {
+ key = ASN1_INTEGER_new();
+ if (!key || !ASN1_INTEGER_set(key, keysize))
+ return 0;
+ }
+ alg = X509_ALGOR_new();
+ if (!alg)
+ {
+ if (key)
+ ASN1_INTEGER_free(key);
+ return 0;
+ }
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(algnid),
+ key ? V_ASN1_INTEGER : V_ASN1_UNDEF, key);
+ if (!*algs)
+ *algs = sk_X509_ALGOR_new_null();
+ if (!*algs || !sk_X509_ALGOR_push(*algs, alg))
+ {
+ X509_ALGOR_free(alg);
+ return 0;
+ }
+ return 1;
+ }
+
+/* Check to see if a cipher exists and if so add S/MIME capabilities */
+
+static int cms_add_cipher_smcap(STACK_OF(X509_ALGOR) **sk, int nid, int arg)
+ {
+ if (EVP_get_cipherbynid(nid))
+ return CMS_add_simple_smimecap(sk, nid, arg);
+ return 1;
+ }
+
+static int cms_add_digest_smcap(STACK_OF(X509_ALGOR) **sk, int nid, int arg)
+ {
+ if (EVP_get_digestbynid(nid))
+ return CMS_add_simple_smimecap(sk, nid, arg);
+ return 1;
+ }
+
+int CMS_add_standard_smimecap(STACK_OF(X509_ALGOR) **smcap)
+ {
+ if (!cms_add_cipher_smcap(smcap, NID_aes_256_cbc, -1)
+ || !cms_add_digest_smcap(smcap, NID_id_GostR3411_94, -1)
+ || !cms_add_cipher_smcap(smcap, NID_id_Gost28147_89, -1)
+ || !cms_add_cipher_smcap(smcap, NID_aes_192_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_aes_128_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_des_ede3_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 128)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 64)
+ || !cms_add_cipher_smcap(smcap, NID_des_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 40))
+ return 0;
+ return 1;
+ }
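+
+/* Sketch for illustration: how the helpers above are typically driven
+ * when a signer is added (compare CMS_add1_signer above): build the
+ * default capability list, attach it as a signed attribute, then free
+ * the local copy.
+ */
+static int example_smimecap(CMS_SignerInfo *si)
+ {
+ int r = 0;
+ STACK_OF(X509_ALGOR) *smcap = NULL;
+ if (CMS_add_standard_smimecap(&smcap))
+ r = CMS_add_smimecap(si, smcap);
+ sk_X509_ALGOR_pop_free(smcap, X509_ALGOR_free);
+ return r;
+ }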
diff --git a/main/openssl/crypto/cms/cms_smime.c b/main/openssl/crypto/cms/cms_smime.c
new file mode 100644
index 00000000..8c56e3a8
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_smime.c
@@ -0,0 +1,850 @@
+/* crypto/cms/cms_smime.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+static int cms_copy_content(BIO *out, BIO *in, unsigned int flags)
+ {
+ unsigned char buf[4096];
+ int r = 0, i;
+ BIO *tmpout = NULL;
+
+ if (out == NULL)
+ tmpout = BIO_new(BIO_s_null());
+ else if (flags & CMS_TEXT)
+ {
+ tmpout = BIO_new(BIO_s_mem());
+ BIO_set_mem_eof_return(tmpout, 0);
+ }
+ else
+ tmpout = out;
+
+ if(!tmpout)
+ {
+ CMSerr(CMS_F_CMS_COPY_CONTENT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Read all content through chain to process digest, decrypt etc */
+ for (;;)
+ {
+ i = BIO_read(in, buf, sizeof(buf));
+ if (i <= 0)
+ {
+ if (BIO_method_type(in) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(in))
+ goto err;
+ }
+ if (i < 0)
+ goto err;
+ break;
+ }
+
+ if (tmpout && (BIO_write(tmpout, buf, i) != i))
+ goto err;
+ }
+
+ if(flags & CMS_TEXT)
+ {
+ if(!SMIME_text(tmpout, out))
+ {
+ CMSerr(CMS_F_CMS_COPY_CONTENT,CMS_R_SMIME_TEXT_ERROR);
+ goto err;
+ }
+ }
+
+ r = 1;
+
+ err:
+ if (tmpout && (tmpout != out))
+ BIO_free(tmpout);
+ return r;
+
+ }
+
+static int check_content(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos || !*pos)
+ {
+ CMSerr(CMS_F_CHECK_CONTENT, CMS_R_NO_CONTENT);
+ return 0;
+ }
+ return 1;
+ }
+
+static void do_free_upto(BIO *f, BIO *upto)
+ {
+ if (upto)
+ {
+ BIO *tbio;
+ do
+ {
+ tbio = BIO_pop(f);
+ BIO_free(f);
+ f = tbio;
+ }
+ while (f != upto);
+ }
+ else
+ BIO_free_all(f);
+ }
+
+int CMS_data(CMS_ContentInfo *cms, BIO *out, unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_data)
+ {
+ CMSerr(CMS_F_CMS_DATA, CMS_R_TYPE_NOT_DATA);
+ return 0;
+ }
+ cont = CMS_dataInit(cms, NULL);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ BIO_free_all(cont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_data_create(BIO *in, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ cms = cms_Data_create();
+ if (!cms)
+ return NULL;
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+int CMS_digest_verify(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_digest)
+ {
+ CMSerr(CMS_F_CMS_DIGEST_VERIFY, CMS_R_TYPE_NOT_DIGESTED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ if (r)
+ r = cms_DigestedData_do_final(cms, cont, 1);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_digest_create(BIO *in, const EVP_MD *md,
+ unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (!md)
+ md = EVP_sha1();
+ cms = cms_DigestedData_create(md);
+ if (!cms)
+ return NULL;
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+int CMS_EncryptedData_decrypt(CMS_ContentInfo *cms,
+ const unsigned char *key, size_t keylen,
+ BIO *dcont, BIO *out, unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_encrypted)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_DECRYPT,
+ CMS_R_TYPE_NOT_ENCRYPTED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ if (CMS_EncryptedData_set1_key(cms, NULL, key, keylen) <= 0)
+ return 0;
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_EncryptedData_encrypt(BIO *in, const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen,
+ unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (!cipher)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT, CMS_R_NO_CIPHER);
+ return NULL;
+ }
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+ if (!CMS_EncryptedData_set1_key(cms, cipher, key, keylen))
+ {
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+static int cms_signerinfo_verify_cert(CMS_SignerInfo *si,
+ X509_STORE *store,
+ STACK_OF(X509) *certs,
+ STACK_OF(X509_CRL) *crls,
+ unsigned int flags)
+ {
+ X509_STORE_CTX ctx;
+ X509 *signer;
+ int i, j, r = 0;
+ CMS_SignerInfo_get0_algs(si, NULL, &signer, NULL, NULL);
+ if (!X509_STORE_CTX_init(&ctx, store, signer, certs))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CERT,
+ CMS_R_STORE_INIT_ERROR);
+ goto err;
+ }
+ X509_STORE_CTX_set_default(&ctx, "smime_sign");
+ if (crls)
+ X509_STORE_CTX_set0_crls(&ctx, crls);
+
+ i = X509_verify_cert(&ctx);
+ if (i <= 0)
+ {
+ j = X509_STORE_CTX_get_error(&ctx);
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CERT,
+ CMS_R_CERTIFICATE_VERIFY_ERROR);
+ ERR_add_error_data(2, "Verify error:",
+ X509_verify_cert_error_string(j));
+ goto err;
+ }
+ r = 1;
+ err:
+ X509_STORE_CTX_cleanup(&ctx);
+ return r;
+
+ }
+
+int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ X509_STORE *store, BIO *dcont, BIO *out, unsigned int flags)
+ {
+ CMS_SignerInfo *si;
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ STACK_OF(X509) *cms_certs = NULL;
+ STACK_OF(X509_CRL) *crls = NULL;
+ X509 *signer;
+ int i, scount = 0, ret = 0;
+ BIO *cmsbio = NULL, *tmpin = NULL;
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ /* Attempt to find all signer certificates */
+
+ sinfos = CMS_get0_SignerInfos(cms);
+
+ if (sk_CMS_SignerInfo_num(sinfos) <= 0)
+ {
+ CMSerr(CMS_F_CMS_VERIFY, CMS_R_NO_SIGNERS);
+ goto err;
+ }
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ CMS_SignerInfo_get0_algs(si, NULL, &signer, NULL, NULL);
+ if (signer)
+ scount++;
+ }
+
+ if (scount != sk_CMS_SignerInfo_num(sinfos))
+ scount += CMS_set1_signers_certs(cms, certs, flags);
+
+ if (scount != sk_CMS_SignerInfo_num(sinfos))
+ {
+ CMSerr(CMS_F_CMS_VERIFY, CMS_R_SIGNER_CERTIFICATE_NOT_FOUND);
+ goto err;
+ }
+
+ /* Attempt to verify all signers certs */
+
+ if (!(flags & CMS_NO_SIGNER_CERT_VERIFY))
+ {
+ cms_certs = CMS_get1_certs(cms);
+ if (!(flags & CMS_NOCRL))
+ crls = CMS_get1_crls(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (!cms_signerinfo_verify_cert(si, store,
+ cms_certs, crls, flags))
+ goto err;
+ }
+ }
+
+ /* Attempt to verify all SignerInfo signed attribute signatures */
+
+ if (!(flags & CMS_NO_ATTR_VERIFY))
+ {
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (CMS_signed_get_attr_count(si) < 0)
+ continue;
+ if (CMS_SignerInfo_verify(si) <= 0)
+ goto err;
+ }
+ }
+
+ /* Performance optimization: if the content is a memory BIO then
+ * store its contents in a temporary read only memory BIO. This
+ * avoids potentially large numbers of slow copies of data which will
+ * occur when reading from a read write memory BIO when signatures
+ * are calculated.
+ */
+
+ if (dcont && (BIO_method_type(dcont) == BIO_TYPE_MEM))
+ {
+ char *ptr;
+ long len;
+ len = BIO_get_mem_data(dcont, &ptr);
+ tmpin = BIO_new_mem_buf(ptr, len);
+ if (tmpin == NULL)
+ {
+ CMSerr(CMS_F_CMS_VERIFY,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ }
+ else
+ tmpin = dcont;
+
+
+ cmsbio = CMS_dataInit(cms, tmpin);
+ if (!cmsbio)
+ goto err;
+
+ if (!cms_copy_content(out, cmsbio, flags))
+ goto err;
+
+ if (!(flags & CMS_NO_CONTENT_VERIFY))
+ {
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (CMS_SignerInfo_verify_content(si, cmsbio) <= 0)
+ {
+ CMSerr(CMS_F_CMS_VERIFY,
+ CMS_R_CONTENT_VERIFY_ERROR);
+ goto err;
+ }
+ }
+ }
+
+ ret = 1;
+
+ err:
+
+ if (dcont && (tmpin == dcont))
+ do_free_upto(cmsbio, dcont);
+ else
+ BIO_free_all(cmsbio);
+
+ if (cms_certs)
+ sk_X509_pop_free(cms_certs, X509_free);
+ if (crls)
+ sk_X509_CRL_pop_free(crls, X509_CRL_free);
+
+ return ret;
+ }
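+
+/* Sketch for illustration: verifying an S/MIME signed message with the
+ * function above. "store" is assumed to hold the trusted CA
+ * certificates; "cont" receives the detached content BIO, if any.
+ */
+static int example_verify(BIO *in, BIO *out, X509_STORE *store)
+ {
+ int r = 0;
+ BIO *cont = NULL;
+ CMS_ContentInfo *cms = SMIME_read_CMS(in, &cont);
+ if (cms)
+ r = CMS_verify(cms, NULL, store, cont, out, 0);
+ CMS_ContentInfo_free(cms);
+ BIO_free(cont);
+ return r;
+ }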
+
+int CMS_verify_receipt(CMS_ContentInfo *rcms, CMS_ContentInfo *ocms,
+ STACK_OF(X509) *certs,
+ X509_STORE *store, unsigned int flags)
+ {
+ int r;
+ flags &= ~(CMS_DETACHED|CMS_TEXT);
+ r = CMS_verify(rcms, certs, store, NULL, NULL, flags);
+ if (r <= 0)
+ return r;
+ return cms_Receipt_verify(rcms, ocms);
+ }
+
+CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs,
+ BIO *data, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ int i;
+
+ cms = CMS_ContentInfo_new();
+ if (!cms || !CMS_SignedData_init(cms))
+ goto merr;
+
+ if (pkey && !CMS_add1_signer(cms, signcert, pkey, NULL, flags))
+ {
+ CMSerr(CMS_F_CMS_SIGN, CMS_R_ADD_SIGNER_ERROR);
+ goto err;
+ }
+
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ X509 *x = sk_X509_value(certs, i);
+ if (!CMS_add1_cert(cms, x))
+ goto merr;
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, data, NULL, flags))
+ return cms;
+ else
+ goto err;
+
+ merr:
+ CMSerr(CMS_F_CMS_SIGN, ERR_R_MALLOC_FAILURE);
+
+ err:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
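+
+/* Sketch for illustration: one-shot S/MIME signing built on the function
+ * above; signcert and pkey are assumed to be loaded elsewhere. With
+ * CMS_STREAM the content is read and the signature computed during
+ * SMIME_write_CMS().
+ */
+static int example_sign(X509 *signcert, EVP_PKEY *pkey, BIO *in, BIO *out)
+ {
+ int r = 0;
+ CMS_ContentInfo *cms = CMS_sign(signcert, pkey, NULL, in, CMS_STREAM);
+ if (cms)
+ r = SMIME_write_CMS(out, cms, in, CMS_STREAM);
+ CMS_ContentInfo_free(cms);
+ return r;
+ }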
+
+CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si,
+ X509 *signcert, EVP_PKEY *pkey,
+ STACK_OF(X509) *certs,
+ unsigned int flags)
+ {
+ CMS_SignerInfo *rct_si;
+ CMS_ContentInfo *cms = NULL;
+ ASN1_OCTET_STRING **pos, *os;
+ BIO *rct_cont = NULL;
+ int r = 0;
+
+ flags &= ~(CMS_STREAM|CMS_TEXT);
+ /* Not really detached but avoids content being allocated */
+ flags |= CMS_PARTIAL|CMS_BINARY|CMS_DETACHED;
+ if (!pkey || !signcert)
+ {
+ CMSerr(CMS_F_CMS_SIGN_RECEIPT, CMS_R_NO_KEY_OR_CERT);
+ return NULL;
+ }
+
+ /* Initialize signed data */
+
+ cms = CMS_sign(NULL, NULL, certs, NULL, flags);
+ if (!cms)
+ goto err;
+
+ /* Set inner content type to signed receipt */
+ if (!CMS_set1_eContentType(cms, OBJ_nid2obj(NID_id_smime_ct_receipt)))
+ goto err;
+
+ rct_si = CMS_add1_signer(cms, signcert, pkey, NULL, flags);
+ if (!rct_si)
+ {
+ CMSerr(CMS_F_CMS_SIGN_RECEIPT, CMS_R_ADD_SIGNER_ERROR);
+ goto err;
+ }
+
+ os = cms_encode_Receipt(si);
+
+ if (!os)
+ goto err;
+
+ /* Set content to digest */
+ rct_cont = BIO_new_mem_buf(os->data, os->length);
+ if (!rct_cont)
+ goto err;
+
+ /* Add msgSigDigest attribute */
+
+ if (!cms_msgSigDigest_add1(rct_si, si))
+ goto err;
+
+ /* Finalize structure */
+ if (!CMS_final(cms, rct_cont, NULL, flags))
+ goto err;
+
+ /* Set embedded content */
+ pos = CMS_get0_content(cms);
+ *pos = os;
+
+ r = 1;
+
+ err:
+ if (rct_cont)
+ BIO_free(rct_cont);
+ if (r)
+ return cms;
+ CMS_ContentInfo_free(cms);
+ return NULL;
+
+ }
+
+CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *data,
+ const EVP_CIPHER *cipher, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ int i;
+ X509 *recip;
+ cms = CMS_EnvelopedData_create(cipher);
+ if (!cms)
+ goto merr;
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ recip = sk_X509_value(certs, i);
+ if (!CMS_add1_recipient_cert(cms, recip, flags))
+ {
+ CMSerr(CMS_F_CMS_ENCRYPT, CMS_R_RECIPIENT_ERROR);
+ goto err;
+ }
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, data, NULL, flags))
+ return cms;
+ else
+ goto err;
+
+ merr:
+ CMSerr(CMS_F_CMS_ENCRYPT, ERR_R_MALLOC_FAILURE);
+ err:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ int debug = 0;
+ ris = CMS_get0_RecipientInfos(cms);
+ if (ris)
+ debug = cms->d.envelopedData->encryptedContentInfo->debug;
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_TRANS)
+ continue;
+ /* If we have a cert try matching RecipientInfo
+ * otherwise try them all.
+ */
+ if (!cert || (CMS_RecipientInfo_ktri_cert_cmp(ri, cert) == 0))
+ {
+ CMS_RecipientInfo_set0_pkey(ri, pk);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_pkey(ri, NULL);
+ if (cert)
+ {
+ /* If not debugging clear any error and
+ * return success to avoid leaking of
+ * information useful to MMA
+ */
+ if (!debug)
+ {
+ ERR_clear_error();
+ return 1;
+ }
+ if (r > 0)
+ return 1;
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY,
+ CMS_R_DECRYPT_ERROR);
+ return 0;
+ }
+ /* If no cert and not debugging don't leave loop
+ * after first successful decrypt. Always attempt
+ * to decrypt all recipients to avoid leaking timing
+ * of a successful decrypt.
+ */
+ else if (r > 0 && debug)
+ return 1;
+ }
+ }
+ /* If no cert and not debugging always return success */
+ if (!cert && !debug)
+ {
+ ERR_clear_error();
+ return 1;
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
+
+int CMS_decrypt_set1_key(CMS_ContentInfo *cms,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ ris = CMS_get0_RecipientInfos(cms);
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_KEK)
+ continue;
+
+ /* If we have an id try matching RecipientInfo
+ * otherwise try them all.
+ */
+ if (!id || (CMS_RecipientInfo_kekri_id_cmp(ri, id, idlen) == 0))
+ {
+ CMS_RecipientInfo_set0_key(ri, key, keylen);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_key(ri, NULL, 0);
+ if (r > 0)
+ return 1;
+ if (id)
+ {
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_KEY,
+ CMS_R_DECRYPT_ERROR);
+ return 0;
+ }
+ ERR_clear_error();
+ }
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_KEY, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
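
A KEKRecipientInfo round trip, sketched under the assumption of a pre-shared 128-bit KEK; the helper names and the use of AES key wrap are illustrative:

    #include <openssl/cms.h>
    #include <openssl/evp.h>
    #include <openssl/objects.h>

    /* 'kek' must stay alive as long as cms does: add0 keeps no copy. */
    CMS_ContentInfo *kek_encrypt(BIO *in, unsigned char *kek,
                                 unsigned char *kid, size_t kidlen)
    {
        CMS_ContentInfo *cms = CMS_encrypt(NULL, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        if (!CMS_add0_recipient_key(cms, NID_id_aes128_wrap, kek, 16,
                                    kid, kidlen, NULL, NULL, NULL)
                || !CMS_final(cms, in, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }

    /* Decrypt side: install the KEK, then run the normal content path. */
    int kek_decrypt(CMS_ContentInfo *cms, unsigned char *kek, BIO *out)
    {
        return CMS_decrypt_set1_key(cms, kek, 16, NULL, 0)
            && CMS_decrypt(cms, NULL, NULL, NULL, out, 0);
    }
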
+
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms,
+ unsigned char *pass, ossl_ssize_t passlen)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ ris = CMS_get0_RecipientInfos(cms);
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_PASS)
+ continue;
+ CMS_RecipientInfo_set0_password(ri, pass, passlen);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_password(ri, NULL, 0);
+ if (r > 0)
+ return 1;
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PASSWORD, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
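
The password (PWRI) variant follows the same pattern; a short sketch with default PBE parameters (iter = -1 and NID_undef let the library choose), headers as in the previous sketch:

    /* Sketch: add a password recipient, then finalize. 'pass' is a
     * NUL-terminated passphrase (passlen -1 means use strlen). */
    CMS_ContentInfo *pw_encrypt(BIO *in, unsigned char *pass)
    {
        CMS_ContentInfo *cms = CMS_encrypt(NULL, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        if (!CMS_add0_recipient_password(cms, -1, NID_undef, NID_undef,
                                         pass, -1, NULL)
                || !CMS_final(cms, in, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }

    /* Decrypt: CMS_decrypt_set1_password(cms, pass, -1) followed by
     * CMS_decrypt(cms, NULL, NULL, NULL, out, 0), as above. */
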
+
+int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert,
+ BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ int r;
+ BIO *cont;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_enveloped)
+ {
+ CMSerr(CMS_F_CMS_DECRYPT, CMS_R_TYPE_NOT_ENVELOPED_DATA);
+ return 0;
+ }
+ if (!dcont && !check_content(cms))
+ return 0;
+ if (flags & CMS_DEBUG_DECRYPT)
+ cms->d.envelopedData->encryptedContentInfo->debug = 1;
+ else
+ cms->d.envelopedData->encryptedContentInfo->debug = 0;
+ if (!pk && !cert && !dcont && !out)
+ return 1;
+ if (pk && !CMS_decrypt_set1_pkey(cms, pk, cert))
+ return 0;
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
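
And the matching consumer side, as a minimal sketch; file names are illustrative:

    #include <openssl/cms.h>
    #include <openssl/pem.h>

    /* Read an S/MIME enveloped message and decrypt with our key pair. */
    int decrypt_smime(EVP_PKEY *pkey, X509 *cert)
    {
        int ok = 0;
        BIO *in = BIO_new_file("enveloped.eml", "r");
        BIO *out = BIO_new_file("plain.txt", "w");
        CMS_ContentInfo *cms = in ? SMIME_read_CMS(in, NULL) : NULL;

        if (cms && out && CMS_decrypt(cms, pkey, cert, NULL, out, 0))
            ok = 1;
        CMS_ContentInfo_free(cms);
        BIO_free(in);
        BIO_free(out);
        return ok;
    }
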
+
+int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, unsigned int flags)
+ {
+ BIO *cmsbio;
+ int ret = 0;
+ if (!(cmsbio = CMS_dataInit(cms, dcont)))
+ {
+ CMSerr(CMS_F_CMS_FINAL,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+ SMIME_crlf_copy(data, cmsbio, flags);
+
+ (void)BIO_flush(cmsbio);
+
+
+ if (!CMS_dataFinal(cms, cmsbio))
+ {
+ CMSerr(CMS_F_CMS_FINAL,CMS_R_CMS_DATAFINAL_ERROR);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ do_free_upto(cmsbio, dcont);
+
+ return ret;
+
+ }
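
CMS_final is the common finalization step: every one-shot constructor in this file defers to it unless CMS_STREAM or CMS_PARTIAL is set. The two-step pattern that CMS_PARTIAL enables, sketched with calls already shown above:

    #include <openssl/cms.h>
    #include <openssl/evp.h>

    /* Build a partial structure, adjust it (extra recipients,
     * unprotected attributes, ...), then bind the content with
     * CMS_final, exactly as the one-shot paths do internally. */
    CMS_ContentInfo *encrypt_two_step(STACK_OF(X509) *recips, BIO *data)
    {
        CMS_ContentInfo *cms = CMS_encrypt(recips, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        /* ... adjustments to the partial structure go here ... */
        if (!CMS_final(cms, data, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }
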
+
+#ifdef ZLIB
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_id_smime_ct_compressedData)
+ {
+ CMSerr(CMS_F_CMS_UNCOMPRESS,
+ CMS_R_TYPE_NOT_COMPRESSED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (comp_nid <= 0)
+ comp_nid = NID_zlib_compression;
+ cms = cms_CompressedData_create(comp_nid);
+ if (!cms)
+ return NULL;
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+#else
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ CMSerr(CMS_F_CMS_UNCOMPRESS, CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return 0;
+ }
+
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags)
+ {
+ CMSerr(CMS_F_CMS_COMPRESS, CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+
+#endif
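
A compress/uncompress round trip, assuming a zlib-enabled build (otherwise both calls fail with UNSUPPORTED_COMPRESSION_ALGORITHM, per the stubs above); 'der' is assumed to be a memory BIO so it can be written and then read back:

    #include <openssl/cms.h>
    #include <openssl/objects.h>

    int compress_roundtrip(BIO *in, BIO *der, BIO *out)
    {
        int ok = 0;
        CMS_ContentInfo *cms = CMS_compress(in, NID_zlib_compression, 0);
        CMS_ContentInfo *back = NULL;

        if (cms && i2d_CMS_bio(der, cms)
                && (back = d2i_CMS_bio(der, NULL)) != NULL
                && CMS_uncompress(back, NULL, out, 0))
            ok = 1;
        CMS_ContentInfo_free(cms);
        CMS_ContentInfo_free(back);
        return ok;
    }
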
diff --git a/main/openssl/crypto/comp/c_rle.c b/main/openssl/crypto/comp/c_rle.c
index 18bceae5..47dfb67f 100644
--- a/main/openssl/crypto/comp/c_rle.c
+++ b/main/openssl/crypto/comp/c_rle.c
@@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out,
{
/* int i; */
- if (olen < (ilen+1))
+ if (ilen == 0 || olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
@@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out,
{
int i;
- if (ilen == 0 || olen < (ilen-1))
+ if (olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
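
The reshuffled bounds checks turn on one detail: olen and ilen are unsigned, so ilen - 1 wraps to UINT_MAX when ilen is 0, and any copy length or bound derived from it is meaningless. A standalone illustration (not OpenSSL code) of the wrap that the explicit ilen == 0 test guards against:

    #include <stdio.h>

    int main(void)
    {
        unsigned int ilen = 0;
        /* 0u - 1 wraps: an ilen - 1 copy length would be enormous */
        printf("ilen - 1 = %u\n", ilen - 1);   /* prints 4294967295 */
        return 0;
    }
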
diff --git a/main/openssl/crypto/conf/conf_mall.c b/main/openssl/crypto/conf/conf_mall.c
index c6f4cb2d..213890e0 100644
--- a/main/openssl/crypto/conf/conf_mall.c
+++ b/main/openssl/crypto/conf/conf_mall.c
@@ -76,5 +76,6 @@ void OPENSSL_load_builtin_modules(void)
#ifndef OPENSSL_NO_ENGINE
ENGINE_add_conf_module();
#endif
+ EVP_add_alg_module();
}
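
EVP_add_alg_module registers the algorithm configuration module, which is what lets an openssl.cnf algorithm section (including the FIPS plumbing added elsewhere in this commit) take effect. A minimal sketch of loading it from application code; the section names in the comment are illustrative:

    #include <openssl/conf.h>

    /* Load the default config file; with the module registered above,
     * an alg_section such as:
     *     [openssl_init_section]
     *     alg_section = algs
     *     [algs]
     *     fips_mode = yes
     * is now honoured. */
    void init_from_config(void)
    {
        OPENSSL_config(NULL);   /* wraps CONF_modules_load_file() */
    }
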
diff --git a/main/openssl/crypto/cpt_err.c b/main/openssl/crypto/cpt_err.c
index 139b9284..289005f6 100644
--- a/main/openssl/crypto/cpt_err.c
+++ b/main/openssl/crypto/cpt_err.c
@@ -1,6 +1,6 @@
/* crypto/cpt_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
{ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA), "CRYPTO_set_ex_data"},
{ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX), "DEF_ADD_INDEX"},
{ERR_FUNC(CRYPTO_F_DEF_GET_CLASS), "DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET), "FIPS_mode_set"},
{ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA), "INT_DUP_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA), "INT_FREE_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA), "INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
static ERR_STRING_DATA CRYPTO_str_reasons[]=
{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
{0,NULL}
};
diff --git a/main/openssl/crypto/cryptlib.c b/main/openssl/crypto/cryptlib.c
index 24fe123e..304c6b70 100644
--- a/main/openssl/crypto/cryptlib.c
+++ b/main/openssl/crypto/cryptlib.c
@@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type,
void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
const char *file,int line))
{
+ /* Calling this here ensures initialisation before any threads
+ * are started.
+ */
+ OPENSSL_init();
locking_callback=func;
}
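
The comment explains why OPENSSL_init() is forced here: the locking callback is the pre-1.1.0 thread-safety hook and must be installed before other threads touch libcrypto. A minimal pthread-based sketch of application setup code (not part of this patch):

    #include <pthread.h>
    #include <openssl/crypto.h>

    static pthread_mutex_t *lock_cs;

    static void locking_cb(int mode, int type, const char *file, int line)
    {
        if (mode & CRYPTO_LOCK)
            pthread_mutex_lock(&lock_cs[type]);
        else
            pthread_mutex_unlock(&lock_cs[type]);
    }

    void thread_setup(void)
    {
        int i;

        lock_cs = OPENSSL_malloc(CRYPTO_num_locks()
                                 * sizeof(pthread_mutex_t));
        for (i = 0; i < CRYPTO_num_locks(); i++)
            pthread_mutex_init(&lock_cs[i], NULL);
        /* Must run before any other thread uses libcrypto; the
         * OPENSSL_init() call above makes that ordering safe. */
        CRYPTO_set_locking_callback(locking_cb);
    }
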
@@ -500,7 +504,7 @@ void CRYPTO_THREADID_current(CRYPTO_THREADID *id)
CRYPTO_THREADID_set_numeric(id, (unsigned long)find_thread(NULL));
#else
/* For everything else, default to using the address of 'errno' */
- CRYPTO_THREADID_set_pointer(id, &errno);
+ CRYPTO_THREADID_set_pointer(id, (void*)&errno);
#endif
}
@@ -661,28 +665,53 @@ const char *CRYPTO_get_lock_name(int type)
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
-unsigned long OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{ if (sizeof(long)==4)
+	/*
+	 * If a 32-bit application pulls the address of OPENSSL_ia32cap_P[0],
+	 * clear the second element to maintain the illusion that the
+	 * vector is 32-bit.
+	 */
+ OPENSSL_ia32cap_P[1]=0;
+ return (unsigned long *)OPENSSL_ia32cap_P;
+}
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
#define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
void OPENSSL_cpuid_setup(void)
{ static int trigger=0;
- unsigned long OPENSSL_ia32_cpuid(void);
+ IA32CAP OPENSSL_ia32_cpuid(void);
+ IA32CAP vec;
char *env;
if (trigger) return;
trigger=1;
- if ((env=getenv("OPENSSL_ia32cap")))
- OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+ if ((env=getenv("OPENSSL_ia32cap"))) {
+ int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+ if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+ if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+ if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+ }
else
- OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+ vec = OPENSSL_ia32_cpuid();
+
	/*
	 * |(1<<10) sets a reserved bit to signal that the variable was
	 * already initialized; this avoids interference with cpuid
	 * snippets in the ELF .init segment.
	 */
+ OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+ OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
}
#endif
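
The environment override parsed above also accepts a leading '~', which masks capabilities off rather than setting them (vec = cpuid & ~vec). A sketch of inspecting the vector through the OPENSSL_ia32cap macro declared in crypto.h; the only capability bit assumed here is 26 (SSE2, mirroring CPUID.1:EDX):

    #include <stdio.h>
    #include <openssl/crypto.h>

    int main(void)
    {
        /* Assumes libcrypto has populated the vector (its startup
         * hooks run OPENSSL_cpuid_setup). Launching the process with
         * OPENSSL_ia32cap=~0x4000000 would clear the bit tested here. */
        printf("SSE2 %savailable to OpenSSL\n",
               (OPENSSL_ia32cap & (1UL << 26)) ? "" : "not ");
        return 0;
    }
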
@@ -896,3 +925,16 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
}
void *OPENSSL_stderr(void) { return stderr; }
+
+int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+ {
+ size_t i;
+ const unsigned char *a = in_a;
+ const unsigned char *b = in_b;
+ unsigned char x = 0;
+
+ for (i = 0; i < len; i++)
+ x |= a[i] ^ b[i];
+
+ return x;
+ }
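
CRYPTO_memcmp exists because plain memcmp returns at the first differing byte, so comparing a computed MAC against an attacker-supplied one leaks how many leading bytes matched. Typical use, as a minimal sketch:

    #include <openssl/crypto.h>

    /* Constant-time tag check: 1 on match, 0 otherwise. The result only
     * answers equal/not-equal; it imposes no ordering. */
    int tag_equal(const unsigned char *expect, const unsigned char *got,
                  size_t len)
    {
        return CRYPTO_memcmp(expect, got, len) == 0;
    }
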
diff --git a/main/openssl/crypto/cryptlib.h b/main/openssl/crypto/cryptlib.h
index fc249c57..d26f9630 100644
--- a/main/openssl/crypto/cryptlib.h
+++ b/main/openssl/crypto/cryptlib.h
@@ -99,8 +99,8 @@ extern "C" {
#define HEX_SIZE(type) (sizeof(type)*2)
void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
-void OPENSSL_showfatal(const char *,...);
+extern unsigned int OPENSSL_ia32cap_P[];
+void OPENSSL_showfatal(const char *fmta,...);
void *OPENSSL_stderr(void);
extern int OPENSSL_NONPIC_relocated;
diff --git a/main/openssl/crypto/crypto.h b/main/openssl/crypto/crypto.h
index b0360cec..f92fc518 100644
--- a/main/openssl/crypto/crypto.h
+++ b/main/openssl/crypto/crypto.h
@@ -488,10 +488,10 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
long (**go)(void));
void *CRYPTO_malloc_locked(int num, const char *file, int line);
-void CRYPTO_free_locked(void *);
+void CRYPTO_free_locked(void *ptr);
void *CRYPTO_malloc(int num, const char *file, int line);
char *CRYPTO_strdup(const char *str, const char *file, int line);
-void CRYPTO_free(void *);
+void CRYPTO_free(void *ptr);
void *CRYPTO_realloc(void *addr,int num, const char *file, int line);
void *CRYPTO_realloc_clean(void *addr,int old_num,int num,const char *file,
int line);
@@ -547,6 +547,40 @@ unsigned long *OPENSSL_ia32cap_loc(void);
#define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
int OPENSSL_isservice(void);
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c) \
+ { \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+ return private_##alg##_Init(c); \
+ } \
+ int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
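
Spelled out for one digest, the expansion these macros produce in an OPENSSL_FIPS build; this is just the preprocessor output of fips_md_init(MD5), not new code:

    int MD5_Init(MD5_CTX *c)
    {
        if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__,
            "Low level API call to digest MD5 forbidden in FIPS mode!");
        return private_MD5_Init(c);
    }
    int private_MD5_Init(MD5_CTX *c)
    /* ...the function body in md5_dgst.c attaches here, so the real
     * implementation becomes private_MD5_Init and the public entry
     * point gains the FIPS guard. */
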
+
+/* CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It
+ * takes an amount of time dependent on |len|, but independent of the contents
+ * of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a
+ * defined order, because the return value when a != b is undefined other
+ * than being non-zero. */
+int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -562,11 +596,13 @@ void ERR_load_CRYPTO_strings(void);
#define CRYPTO_F_CRYPTO_SET_EX_DATA 102
#define CRYPTO_F_DEF_ADD_INDEX 104
#define CRYPTO_F_DEF_GET_CLASS 105
+#define CRYPTO_F_FIPS_MODE_SET 109
#define CRYPTO_F_INT_DUP_EX_DATA 106
#define CRYPTO_F_INT_FREE_EX_DATA 107
#define CRYPTO_F_INT_NEW_EX_DATA 108
/* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101
#define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100
#ifdef __cplusplus
diff --git a/main/openssl/crypto/des/asm/crypt586.S b/main/openssl/crypto/des/asm/crypt586.S
new file mode 100644
index 00000000..fb321ba9
--- /dev/null
+++ b/main/openssl/crypto/des/asm/crypt586.S
@@ -0,0 +1,879 @@
+.file "crypt586.s"
+.text
+.globl fcrypt_body
+.type fcrypt_body,@function
+.align 16
+fcrypt_body:
+.L_fcrypt_body_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %edi,%edi
+ xorl %esi,%esi
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl DES_SPtrans@GOT(%edx),%edx
+ pushl %edx
+ movl 28(%esp),%ebp
+ pushl $25
+.L001start:
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl (%ebp),%ebx
+ xorl %ebx,%eax
+ movl 4(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 8(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 12(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 16(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 20(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 24(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 28(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 32(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 36(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 40(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 44(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 48(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 52(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 56(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 60(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 64(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 68(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 72(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 76(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 80(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 84(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 88(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 92(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 96(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 100(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 104(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 108(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 112(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 116(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 120(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 124(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+ movl (%esp),%ebx
+ movl %edi,%eax
+ decl %ebx
+ movl %esi,%edi
+ movl %eax,%esi
+ movl %ebx,(%esp)
+ jnz .L001start
+
+
+ movl 28(%esp),%edx
+ rorl $1,%edi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0xaaaaaaaa,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $23,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $10,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x33333333,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $18,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ roll $12,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xf0f0f0f0,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %edi,4(%edx)
+ addl $8,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size fcrypt_body,.-.L_fcrypt_body_begin
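
crypt586.S is perlasm output: fcrypt_body is the crypt(3) core, sixteen unrolled DES rounds per pass, looped 25 times (the pushl $25 counter above). Each repeated block is one round done as eight S-box lookups in DES_SPtrans (the tables indexed at offsets 0x000-0x700). Roughly the following C, modelled on the D_ENCRYPT macro in OpenSSL's des_locl.h; the salt-dependent bit swap that crypt adds (the 36(%esp)/40(%esp) masks) is omitted here:

    typedef unsigned int u32;
    #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* One DES round: mix R with two round-key words, cut into 6-bit
     * indices, XOR eight SP-table words into L. The asm's 0xfcfcfcfc /
     * 0xcfcfcfcf masks let it index by whole bytes; the shift-and-0x3f
     * form below is equivalent. */
    static void des_round(u32 *L, u32 R, const u32 k[2],
                          const u32 SP[8][64])
    {
        u32 u = R ^ k[0];
        u32 t = ROTR(R ^ k[1], 4);

        *L ^= SP[0][(u >>  2) & 0x3f] ^ SP[2][(u >> 10) & 0x3f] ^
              SP[4][(u >> 18) & 0x3f] ^ SP[6][(u >> 26) & 0x3f] ^
              SP[1][(t >>  2) & 0x3f] ^ SP[3][(t >> 10) & 0x3f] ^
              SP[5][(t >> 18) & 0x3f] ^ SP[7][(t >> 26) & 0x3f];
    }
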
diff --git a/main/openssl/crypto/des/asm/des-586.S b/main/openssl/crypto/des/asm/des-586.S
new file mode 100644
index 00000000..2fbd340d
--- /dev/null
+++ b/main/openssl/crypto/des/asm/des-586.S
@@ -0,0 +1,1837 @@
+.file "des-586.s"
+.text
+.globl DES_SPtrans
+.type _x86_DES_encrypt,@function
+.align 16
+_x86_DES_encrypt:
+ pushl %ecx
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_encrypt,.-_x86_DES_encrypt
+.type _x86_DES_decrypt,@function
+.align 16
+_x86_DES_decrypt:
+ pushl %ecx
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_decrypt,.-_x86_DES_decrypt
+.globl DES_encrypt1
+.type DES_encrypt1,@function
+.align 16
+DES_encrypt1:
+.L_DES_encrypt1_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%esi
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%esi),%eax
+ movl 28(%esp),%ebx
+ movl 4(%esi),%edi
+
+
+ roll $4,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xf0f0f0f0,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $20,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $14,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x33333333,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $22,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x03fc03fc,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $9,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xaaaaaaaa,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $1,%edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L000pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L001decrypt
+ call _x86_DES_encrypt
+ jmp .L002done
+.L001decrypt:
+ call _x86_DES_decrypt
+.L002done:
+
+
+ movl 20(%esp),%edx
+ rorl $1,%esi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %esi,4(%edx)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt1,.-.L_DES_encrypt1_begin
+.globl DES_encrypt2
+.type DES_encrypt2,@function
+.align 16
+DES_encrypt2:
+.L_DES_encrypt2_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%eax
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%eax),%esi
+ movl 28(%esp),%ebx
+ roll $3,%esi
+ movl 4(%eax),%edi
+ roll $3,%edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L003pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L004decrypt
+ call _x86_DES_encrypt
+ jmp .L005done
+.L004decrypt:
+ call _x86_DES_decrypt
+.L005done:
+
+
+ rorl $3,%edi
+ movl 20(%esp),%eax
+ rorl $3,%esi
+ movl %edi,(%eax)
+ movl %esi,4(%eax)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt2,.-.L_DES_encrypt2_begin
+.globl DES_encrypt3
+.type DES_encrypt3,@function
+.align 16
+DES_encrypt3:
+.L_DES_encrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%eax
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%esi
+ movl $1,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_encrypt3,.-.L_DES_encrypt3_begin
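
DES_encrypt3 above is the EDE driver: the $1 / $0 / $1 flags pushed before the three .L_DES_encrypt2_begin calls select encrypt, decrypt, encrypt with the three key schedules, the same shape as the portable C path:

    /* Triple DES core as the asm sequences it; the initial and final
     * permutations are done once, outside these calls. */
    DES_encrypt2(data, ks1, DES_ENCRYPT);
    DES_encrypt2(data, ks2, DES_DECRYPT);
    DES_encrypt2(data, ks3, DES_ENCRYPT);
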
+.globl DES_decrypt3
+.type DES_decrypt3,@function
+.align 16
+DES_decrypt3:
+.L_DES_decrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%esi
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%eax
+ movl $0,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_decrypt3,.-.L_DES_decrypt3_begin
+.globl DES_ncbc_encrypt
+.type DES_ncbc_encrypt,@function
+.align 16
+DES_ncbc_encrypt:
+.L_DES_ncbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ pushl %ecx
+
+ movl 52(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L006decrypt
+ andl $4294967288,%ebp
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ jz .L007encrypt_finish
+.L008encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L008encrypt_loop
+.L007encrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ call .L010PIC_point
+.L010PIC_point:
+ popl %edx
+ leal .L011cbc_enc_jmp_table-.L010PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L012ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L013ej6:
+ movb 5(%esi),%dh
+.L014ej5:
+ movb 4(%esi),%dl
+.L015ej4:
+ movl (%esi),%ecx
+ jmp .L016ejend
+.L017ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L018ej2:
+ movb 1(%esi),%ch
+.L019ej1:
+ movb (%esi),%cl
+.L016ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L009finish
+.L006decrypt:
+ andl $4294967288,%ebp
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ jz .L020decrypt_finish
+.L021decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,20(%esp)
+ movl %ebx,24(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L021decrypt_loop
+.L020decrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L022dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L023dj6:
+ movb %dh,5(%edi)
+.L024dj5:
+ movb %dl,4(%edi)
+.L025dj4:
+ movl %ecx,(%edi)
+ jmp .L026djend
+.L027dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L028dj2:
+ movb %ch,1(%esi)
+.L029dj1:
+ movb %cl,(%esi)
+.L026djend:
+ jmp .L009finish
+.L009finish:
+ movl 64(%esp),%ecx
+ addl $28,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L011cbc_enc_jmp_table:
+.long 0
+.long .L019ej1-.L010PIC_point
+.long .L018ej2-.L010PIC_point
+.long .L017ej3-.L010PIC_point
+.long .L015ej4-.L010PIC_point
+.long .L014ej5-.L010PIC_point
+.long .L013ej6-.L010PIC_point
+.long .L012ej7-.L010PIC_point
+.align 64
+.size DES_ncbc_encrypt,.-.L_DES_ncbc_encrypt_begin
+.globl DES_ede3_cbc_encrypt
+.type DES_ede3_cbc_encrypt,@function
+.align 16
+DES_ede3_cbc_encrypt:
+.L_DES_ede3_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 44(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 64(%esp),%ecx
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L030decrypt
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L031encrypt_finish
+.L032encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L032encrypt_loop
+.L031encrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ call .L034PIC_point
+.L034PIC_point:
+ popl %edx
+ leal .L035cbc_enc_jmp_table-.L034PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L036ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L037ej6:
+ movb 5(%esi),%dh
+.L038ej5:
+ movb 4(%esi),%dl
+.L039ej4:
+ movl (%esi),%ecx
+ jmp .L040ejend
+.L041ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L042ej2:
+ movb 1(%esi),%ch
+.L043ej1:
+ movb (%esi),%cl
+.L040ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L033finish
+.L030decrypt:
+ andl $4294967288,%ebp
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ jz .L044decrypt_finish
+.L045decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,24(%esp)
+ movl %ebx,28(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L045decrypt_loop
+.L044decrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L046dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L047dj6:
+ movb %dh,5(%edi)
+.L048dj5:
+ movb %dl,4(%edi)
+.L049dj4:
+ movl %ecx,(%edi)
+ jmp .L050djend
+.L051dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L052dj2:
+ movb %ch,1(%esi)
+.L053dj1:
+ movb %cl,(%esi)
+.L050djend:
+ jmp .L033finish
+.L033finish:
+ movl 76(%esp),%ecx
+ addl $32,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L035cbc_enc_jmp_table:
+.long 0
+.long .L043ej1-.L034PIC_point
+.long .L042ej2-.L034PIC_point
+.long .L041ej3-.L034PIC_point
+.long .L039ej4-.L034PIC_point
+.long .L038ej5-.L034PIC_point
+.long .L037ej6-.L034PIC_point
+.long .L036ej7-.L034PIC_point
+.align 64
+.size DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
+.align 64
+DES_SPtrans:
+.long 34080768,524288,33554434,34080770
+.long 33554432,526338,524290,33554434
+.long 526338,34080768,34078720,2050
+.long 33556482,33554432,0,524290
+.long 524288,2,33556480,526336
+.long 34080770,34078720,2050,33556480
+.long 2,2048,526336,34078722
+.long 2048,33556482,34078722,0
+.long 0,34080770,33556480,524290
+.long 34080768,524288,2050,33556480
+.long 34078722,2048,526336,33554434
+.long 526338,2,33554434,34078720
+.long 34080770,526336,34078720,33556482
+.long 33554432,2050,524290,0
+.long 524288,33554432,33556482,34080768
+.long 2,34078722,2048,526338
+.long 1074823184,0,1081344,1074790400
+.long 1073741840,32784,1073774592,1081344
+.long 32768,1074790416,16,1073774592
+.long 1048592,1074823168,1074790400,16
+.long 1048576,1073774608,1074790416,32768
+.long 1081360,1073741824,0,1048592
+.long 1073774608,1081360,1074823168,1073741840
+.long 1073741824,1048576,32784,1074823184
+.long 1048592,1074823168,1073774592,1081360
+.long 1074823184,1048592,1073741840,0
+.long 1073741824,32784,1048576,1074790416
+.long 32768,1073741824,1081360,1073774608
+.long 1074823168,32768,0,1073741840
+.long 16,1074823184,1081344,1074790400
+.long 1074790416,1048576,32784,1073774592
+.long 1073774608,16,1074790400,1081344
+.long 67108865,67371264,256,67109121
+.long 262145,67108864,67109121,262400
+.long 67109120,262144,67371008,1
+.long 67371265,257,1,67371009
+.long 0,262145,67371264,256
+.long 257,67371265,262144,67108865
+.long 67371009,67109120,262401,67371008
+.long 262400,0,67108864,262401
+.long 67371264,256,1,262144
+.long 257,262145,67371008,67109121
+.long 0,67371264,262400,67371009
+.long 262145,67108864,67371265,1
+.long 262401,67108865,67108864,67371265
+.long 262144,67109120,67109121,262400
+.long 67109120,0,67371009,257
+.long 67108865,262401,256,67371008
+.long 4198408,268439552,8,272633864
+.long 0,272629760,268439560,4194312
+.long 272633856,268435464,268435456,4104
+.long 268435464,4198408,4194304,268435456
+.long 272629768,4198400,4096,8
+.long 4198400,268439560,272629760,4096
+.long 4104,0,4194312,272633856
+.long 268439552,272629768,272633864,4194304
+.long 272629768,4104,4194304,268435464
+.long 4198400,268439552,8,272629760
+.long 268439560,0,4096,4194312
+.long 0,272629768,272633856,4096
+.long 268435456,272633864,4198408,4194304
+.long 272633864,8,268439552,4198408
+.long 4194312,4198400,272629760,268439560
+.long 4104,268435456,268435464,272633856
+.long 134217728,65536,1024,134284320
+.long 134283296,134218752,66592,134283264
+.long 65536,32,134217760,66560
+.long 134218784,134283296,134284288,0
+.long 66560,134217728,65568,1056
+.long 134218752,66592,0,134217760
+.long 32,134218784,134284320,65568
+.long 134283264,1024,1056,134284288
+.long 134284288,134218784,65568,134283264
+.long 65536,32,134217760,134218752
+.long 134217728,66560,134284320,0
+.long 66592,134217728,1024,65568
+.long 134218784,1024,0,134284320
+.long 134283296,134284288,1056,65536
+.long 66560,134283296,134218752,1056
+.long 32,66592,134283264,134217760
+.long 2147483712,2097216,0,2149588992
+.long 2097216,8192,2147491904,2097152
+.long 8256,2149589056,2105344,2147483648
+.long 2147491840,2147483712,2149580800,2105408
+.long 2097152,2147491904,2149580864,0
+.long 8192,64,2149588992,2149580864
+.long 2149589056,2149580800,2147483648,8256
+.long 64,2105344,2105408,2147491840
+.long 8256,2147483648,2147491840,2105408
+.long 2149588992,2097216,0,2147491840
+.long 2147483648,8192,2149580864,2097152
+.long 2097216,2149589056,2105344,64
+.long 2149589056,2105344,2097152,2147491904
+.long 2147483712,2149580800,2105408,0
+.long 8192,2147483712,2147491904,2149588992
+.long 2149580800,8256,64,2149580864
+.long 16384,512,16777728,16777220
+.long 16794116,16388,16896,0
+.long 16777216,16777732,516,16793600
+.long 4,16794112,16793600,516
+.long 16777732,16384,16388,16794116
+.long 0,16777728,16777220,16896
+.long 16793604,16900,16794112,4
+.long 16900,16793604,512,16777216
+.long 16900,16793600,16793604,516
+.long 16384,512,16777216,16793604
+.long 16777732,16900,16896,0
+.long 512,16777220,4,16777728
+.long 0,16777732,16777728,16896
+.long 516,16384,16794116,16777216
+.long 16794112,4,16388,16794116
+.long 16777220,16794112,16793600,16388
+.long 545259648,545390592,131200,0
+.long 537001984,8388736,545259520,545390720
+.long 128,536870912,8519680,131200
+.long 8519808,537002112,536871040,545259520
+.long 131072,8519808,8388736,537001984
+.long 545390720,536871040,0,8519680
+.long 536870912,8388608,537002112,545259648
+.long 8388608,131072,545390592,128
+.long 8388608,131072,536871040,545390720
+.long 131200,536870912,0,8519680
+.long 545259648,537002112,537001984,8388736
+.long 545390592,128,8388736,537001984
+.long 545390720,8388608,545259520,536871040
+.long 8519680,131200,537002112,545259520
+.long 128,545390592,8519808,0
+.long 536870912,545259648,131072,8519808
diff --git a/main/openssl/crypto/des/des.h b/main/openssl/crypto/des/des.h
index 92b66635..1eaedcbd 100644
--- a/main/openssl/crypto/des/des.h
+++ b/main/openssl/crypto/des/des.h
@@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
void DES_string_to_key(const char *str,DES_cblock *key);
void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,
diff --git a/main/openssl/crypto/des/set_key.c b/main/openssl/crypto/des/set_key.c
index 3004cc3a..da4d62e1 100644
--- a/main/openssl/crypto/des/set_key.c
+++ b/main/openssl/crypto/des/set_key.c
@@ -63,6 +63,7 @@
* 1.1 added norm_expand_bits
* 1.0 First working version
*/
+#include <openssl/crypto.h>
#include "des_locl.h"
OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0) /* defaults to false */
@@ -335,6 +336,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule)
}
void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(DES);
+ private_DES_set_key_unchecked(key, schedule);
+ }
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
{
static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
register DES_LONG c,d,t,s,t2;
diff --git a/main/openssl/crypto/des/str2key.c b/main/openssl/crypto/des/str2key.c
index 9c2054bd..1077f99d 100644
--- a/main/openssl/crypto/des/str2key.c
+++ b/main/openssl/crypto/des/str2key.c
@@ -56,8 +56,8 @@
* [including the GNU Public Licence.]
*/
-#include "des_locl.h"
#include <openssl/crypto.h>
+#include "des_locl.h"
void DES_string_to_key(const char *str, DES_cblock *key)
{
diff --git a/main/openssl/crypto/dh/dh.h b/main/openssl/crypto/dh/dh.h
index 849309a4..ea59e610 100644
--- a/main/openssl/crypto/dh/dh.h
+++ b/main/openssl/crypto/dh/dh.h
@@ -86,6 +86,21 @@
* be used for all exponents.
*/
+/* If this flag is set, the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set, the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -230,6 +245,9 @@ void ERR_load_DH_strings(void);
#define DH_F_COMPUTE_KEY 102
#define DH_F_DHPARAMS_PRINT_FP 101
#define DH_F_DH_BUILTIN_GENPARAMS 106
+#define DH_F_DH_COMPUTE_KEY 114
+#define DH_F_DH_GENERATE_KEY 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX 116
#define DH_F_DH_NEW_METHOD 105
#define DH_F_DH_PARAM_DECODE 107
#define DH_F_DH_PRIV_DECODE 110
@@ -249,7 +267,9 @@ void ERR_load_DH_strings(void);
#define DH_R_DECODE_ERROR 104
#define DH_R_INVALID_PUBKEY 102
#define DH_R_KEYS_NOT_SET 108
+#define DH_R_KEY_SIZE_TOO_SMALL 110
#define DH_R_MODULUS_TOO_LARGE 103
+#define DH_R_NON_FIPS_METHOD 111
#define DH_R_NO_PARAMETERS_SET 107
#define DH_R_NO_PRIVATE_VALUE 100
#define DH_R_PARAMETER_ENCODING_ERROR 105
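
A minimal usage sketch of the DH_FLAG_NON_FIPS_ALLOW flag added above, assuming a plain OpenSSL 1.0.1 build; the helper name, the 2048-bit size and generator 2 are illustrative only:

#include <openssl/dh.h>

/* Sketch: opt this key out of the FIPS gate before the gated call
 * (here, parameter generation). New DH objects never inherit the
 * flag, per the dh_lib.c masking change below. */
static DH *make_dh_params(void)
	{
	DH *dh = DH_new();
	if (dh == NULL)
		return NULL;
	dh->flags |= DH_FLAG_NON_FIPS_ALLOW;
	if (!DH_generate_parameters_ex(dh, 2048, DH_GENERATOR_2, NULL))
		{
		DH_free(dh);
		return NULL;
		}
	return dh;
	}
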
diff --git a/main/openssl/crypto/dh/dh_ameth.c b/main/openssl/crypto/dh/dh_ameth.c
index 377caf96..02ec2d47 100644
--- a/main/openssl/crypto/dh/dh_ameth.c
+++ b/main/openssl/crypto/dh/dh_ameth.c
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth =
dh_copy_parameters,
dh_cmp_parameters,
dh_param_print,
+ 0,
int_dh_free,
0
diff --git a/main/openssl/crypto/dh/dh_err.c b/main/openssl/crypto/dh/dh_err.c
index d5cf0c22..56d3df73 100644
--- a/main/openssl/crypto/dh/dh_err.c
+++ b/main/openssl/crypto/dh/dh_err.c
@@ -1,6 +1,6 @@
/* crypto/dh/dh_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]=
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
{ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"},
{ERR_FUNC(DH_F_DH_NEW_METHOD), "DH_new_method"},
{ERR_FUNC(DH_F_DH_PARAM_DECODE), "DH_PARAM_DECODE"},
{ERR_FUNC(DH_F_DH_PRIV_DECODE), "DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]=
{ERR_REASON(DH_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(DH_R_INVALID_PUBKEY) ,"invalid public key"},
{ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
diff --git a/main/openssl/crypto/dh/dh_gen.c b/main/openssl/crypto/dh/dh_gen.c
index cfd5b118..7b1fe9c9 100644
--- a/main/openssl/crypto/dh/dh_gen.c
+++ b/main/openssl/crypto/dh/dh_gen.c
@@ -66,12 +66,29 @@
#include <openssl/bn.h>
#include <openssl/dh.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->generate_params)
return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_generate_parameters_ex(ret, prime_len,
+ generator, cb);
+#endif
return dh_builtin_genparams(ret, prime_len, generator, cb);
}
diff --git a/main/openssl/crypto/dh/dh_key.c b/main/openssl/crypto/dh/dh_key.c
index e7db4403..89a74db4 100644
--- a/main/openssl/crypto/dh/dh_key.c
+++ b/main/openssl/crypto/dh/dh_key.c
@@ -73,11 +73,27 @@ static int dh_finish(DH *dh);
int DH_generate_key(DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->generate_key(dh);
}
int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->compute_key(key, pub_key, dh);
}
@@ -138,8 +154,21 @@ static int generate_key(DH *dh)
if (generate_new_key)
{
- l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
- if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ if (dh->q)
+ {
+ do
+ {
+ if (!BN_rand_range(priv_key, dh->q))
+ goto err;
+ }
+ while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+ }
+ else
+ {
+ /* secret exponent length */
+ l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+ if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ }
}
{
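
The generate_key hunk above draws the private value by rejection sampling whenever a subgroup order q is available; a standalone sketch of that loop (helper name illustrative), where BN_rand_range(r, q) yields 0 <= r < q and the values 0 and 1 are rejected:

#include <openssl/bn.h>

/* Sketch: pick priv_key uniformly from [2, q-1], mirroring the
 * dh_key.c change above. */
static int pick_private_value(BIGNUM *priv_key, const BIGNUM *q)
	{
	do
		{
		if (!BN_rand_range(priv_key, q))
			return 0;
		}
	while (BN_is_zero(priv_key) || BN_is_one(priv_key));
	return 1;
	}
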
diff --git a/main/openssl/crypto/dh/dh_lib.c b/main/openssl/crypto/dh/dh_lib.c
index 7aef080e..00218f2b 100644
--- a/main/openssl/crypto/dh/dh_lib.c
+++ b/main/openssl/crypto/dh/dh_lib.c
@@ -64,6 +64,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth)
const DH_METHOD *DH_get_default_method(void)
{
if(!default_DH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_openssl();
+ else
+ return DH_OpenSSL();
+#else
default_DH_method = DH_OpenSSL();
+#endif
+ }
return default_DH_method;
}
@@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine)
ret->counter = NULL;
ret->method_mont_p=NULL;
ret->references = 1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
diff --git a/main/openssl/crypto/dsa/dsa.h b/main/openssl/crypto/dsa/dsa.h
index ac50a5c8..7531c653 100644
--- a/main/openssl/crypto/dsa/dsa.h
+++ b/main/openssl/crypto/dsa/dsa.h
@@ -96,6 +96,25 @@
* faster variable sliding window method to
* be used for all exponents.
*/
+#define DSA_FLAG_NONCE_FROM_HASH 0x04 /* Causes the DSA nonce to be calculated
+ from SHA512(private_key + H(message) +
+ random). This strengthens DSA against a
+ weak PRNG. */
+
+/* If this flag is set, the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set, the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW 0x0400
#ifdef __cplusplus
extern "C" {
@@ -115,8 +134,9 @@ struct dsa_method
{
const char *name;
DSA_SIG * (*dsa_do_sign)(const unsigned char *dgst, int dlen, DSA *dsa);
- int (*dsa_sign_setup)(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp);
+ int (*dsa_sign_setup)(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
int (*dsa_do_verify)(const unsigned char *dgst, int dgst_len,
DSA_SIG *sig, DSA *dsa);
int (*dsa_mod_exp)(DSA *dsa, BIGNUM *rr, BIGNUM *a1, BIGNUM *p1,
@@ -272,6 +292,8 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSAPARAMS_PRINT_FP 101
#define DSA_F_DSA_DO_SIGN 112
#define DSA_F_DSA_DO_VERIFY 113
+#define DSA_F_DSA_GENERATE_KEY 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX 123
#define DSA_F_DSA_NEW_METHOD 103
#define DSA_F_DSA_PARAM_DECODE 119
#define DSA_F_DSA_PRINT_FP 105
@@ -282,6 +304,7 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSA_SIGN 106
#define DSA_F_DSA_SIGN_SETUP 107
#define DSA_F_DSA_SIG_NEW 109
+#define DSA_F_DSA_SIG_PRINT 125
#define DSA_F_DSA_VERIFY 108
#define DSA_F_I2D_DSA_SIG 111
#define DSA_F_OLD_DSA_PRIV_DECODE 122
@@ -298,6 +321,9 @@ void ERR_load_DSA_strings(void);
#define DSA_R_INVALID_DIGEST_TYPE 106
#define DSA_R_MISSING_PARAMETERS 101
#define DSA_R_MODULUS_TOO_LARGE 103
+#define DSA_R_NEED_NEW_SETUP_VALUES 110
+#define DSA_R_NONCE_CANNOT_BE_PRECOMPUTED 112
+#define DSA_R_NON_FIPS_DSA_METHOD 111
#define DSA_R_NO_PARAMETERS_SET 107
#define DSA_R_PARAMETER_ENCODING_ERROR 105
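
A sketch of enabling the hashed-nonce behaviour declared above on a DSA key that already carries parameters and a private key; the helper name and the choice of SHA-1 for the message digest are illustrative:

#include <openssl/dsa.h>
#include <openssl/sha.h>

/* Sketch: with DSA_FLAG_NONCE_FROM_HASH set, the signing nonce k is
 * derived from SHA512(private_key + H(message) + random) instead of
 * the PRNG alone. The type argument of DSA_sign is ignored. */
static int sign_hardened(DSA *dsa, const unsigned char *msg, size_t msg_len,
			 unsigned char *sig, unsigned int *sig_len)
	{
	unsigned char dgst[SHA_DIGEST_LENGTH];

	dsa->flags |= DSA_FLAG_NONCE_FROM_HASH;
	SHA1(msg, msg_len, dgst);
	return DSA_sign(0, dgst, sizeof(dgst), sig, sig_len, dsa);
	}
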
diff --git a/main/openssl/crypto/dsa/dsa_ameth.c b/main/openssl/crypto/dsa/dsa_ameth.c
index 6413aae4..376156ec 100644
--- a/main/openssl/crypto/dsa/dsa_ameth.c
+++ b/main/openssl/crypto/dsa/dsa_ameth.c
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder)
return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
}
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ DSA_SIG *dsa_sig;
+ const unsigned char *p;
+ if (!sig)
+ {
+ if (BIO_puts(bp, "\n") <= 0)
+ return 0;
+ else
+ return 1;
+ }
+ p = sig->data;
+ dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+ if (dsa_sig)
+ {
+ int rv = 0;
+ size_t buf_len = 0;
+ unsigned char *m=NULL;
+ update_buflen(dsa_sig->r, &buf_len);
+ update_buflen(dsa_sig->s, &buf_len);
+ m = OPENSSL_malloc(buf_len+10);
+ if (m == NULL)
+ {
+ DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (BIO_write(bp, "\n", 1) != 1)
+ goto err;
+
+ if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent))
+ goto err;
+ if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent))
+ goto err;
+ rv = 1;
+ err:
+ if (m)
+ OPENSSL_free(m);
+ DSA_SIG_free(dsa_sig);
+ return rv;
+ }
+ return X509_signature_dump(bp, sig, indent);
+ }
+
static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
switch (op)
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] =
dsa_copy_parameters,
dsa_cmp_parameters,
dsa_param_print,
+ dsa_sig_print,
int_dsa_free,
dsa_pkey_ctrl,
diff --git a/main/openssl/crypto/dsa/dsa_asn1.c b/main/openssl/crypto/dsa/dsa_asn1.c
index c37460b2..60585343 100644
--- a/main/openssl/crypto/dsa/dsa_asn1.c
+++ b/main/openssl/crypto/dsa/dsa_asn1.c
@@ -61,6 +61,7 @@
#include <openssl/dsa.h>
#include <openssl/asn1.h>
#include <openssl/asn1t.h>
+#include <openssl/rand.h>
/* Override the default new methods */
static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = {
ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
} ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
/* Override the default free and new methods */
static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa)
{
return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+ unsigned int *siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ RAND_seed(dgst, dlen);
+ s=DSA_do_sign(dgst,dlen,dsa);
+ if (s == NULL)
+ {
+ *siglen=0;
+ return(0);
+ }
+ *siglen=i2d_DSA_SIG(s,&sig);
+ DSA_SIG_free(s);
+ return(1);
+ }
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ * 1: correct signature
+ * 0: incorrect signature
+ * -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+ const unsigned char *sigbuf, int siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ int ret=-1;
+
+ s = DSA_SIG_new();
+ if (s == NULL) return(ret);
+ if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+ ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+ DSA_SIG_free(s);
+ return(ret);
+ }
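
The DSA_verify contract above distinguishes a bad signature (0) from an internal error (-1); a caller sketch (helper name illustrative) that keeps the two cases apart:

#include <openssl/dsa.h>

/* Sketch: verify a DER-encoded signature over an already-hashed
 * message; only an explicit 1 from DSA_verify counts as valid. */
static int check_sig(DSA *dsa, const unsigned char *dgst, int dgst_len,
		     const unsigned char *sig, int sig_len)
	{
	int rc = DSA_verify(0, dgst, dgst_len, sig, sig_len, dsa);
	if (rc < 0)
		return 0;	/* decode/bignum failure, not a mere mismatch */
	return rc;
	}
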
diff --git a/main/openssl/crypto/dsa/dsa_err.c b/main/openssl/crypto/dsa/dsa_err.c
index bba984e9..e6171cce 100644
--- a/main/openssl/crypto/dsa/dsa_err.c
+++ b/main/openssl/crypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
/* crypto/dsa/dsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"},
{ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"},
{ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX), "DSA_generate_parameters_ex"},
{ERR_FUNC(DSA_F_DSA_NEW_METHOD), "DSA_new_method"},
{ERR_FUNC(DSA_F_DSA_PARAM_DECODE), "DSA_PARAM_DECODE"},
{ERR_FUNC(DSA_F_DSA_PRINT_FP), "DSA_print_fp"},
@@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSA_SIGN), "DSA_sign"},
{ERR_FUNC(DSA_F_DSA_SIGN_SETUP), "DSA_sign_setup"},
{ERR_FUNC(DSA_F_DSA_SIG_NEW), "DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT), "DSA_SIG_PRINT"},
{ERR_FUNC(DSA_F_DSA_VERIFY), "DSA_verify"},
{ERR_FUNC(DSA_F_I2D_DSA_SIG), "i2d_DSA_SIG"},
{ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE), "OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,9 @@ static ERR_STRING_DATA DSA_str_reasons[]=
{ERR_REASON(DSA_R_INVALID_DIGEST_TYPE) ,"invalid digest type"},
{ERR_REASON(DSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(DSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NONCE_CANNOT_BE_PRECOMPUTED),"nonce cannot be precomputed"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD) ,"non fips dsa method"},
{ERR_REASON(DSA_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
{0,NULL}
diff --git a/main/openssl/crypto/dsa/dsa_gen.c b/main/openssl/crypto/dsa/dsa_gen.c
index cb0b4538..c398761d 100644
--- a/main/openssl/crypto/dsa/dsa_gen.c
+++ b/main/openssl/crypto/dsa/dsa_gen.c
@@ -81,13 +81,33 @@
#include <openssl/sha.h>
#include "dsa_locl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int DSA_generate_parameters_ex(DSA *ret, int bits,
const unsigned char *seed_in, int seed_len,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->dsa_paramgen)
return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+ else if (FIPS_mode())
+ {
+ return FIPS_dsa_generate_parameters_ex(ret, bits,
+ seed_in, seed_len,
+ counter_ret, h_ret, cb);
+ }
+#endif
else
{
const EVP_MD *evpmd;
@@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits,
}
return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
- seed_in, seed_len, counter_ret, h_ret, cb);
+ seed_in, seed_len, NULL, counter_ret, h_ret, cb);
}
}
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
int ok=0;
@@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
/* step 2 */
- EVP_Digest(seed, qsize, md, NULL, evpmd, NULL);
- EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL);
+ if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL))
+ goto err;
+ if (!EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL))
+ goto err;
for (i = 0; i < qsize; i++)
md[i]^=buf2[i];
@@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
break;
}
- EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+ if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+ NULL))
+ goto err;
/* step 8 */
if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@ err:
}
if (counter_ret != NULL) *counter_ret=counter;
if (h_ret != NULL) *h_ret=h;
+ if (seed_out)
+ memcpy(seed_out, seed, qsize);
}
if(ctx)
{
diff --git a/main/openssl/crypto/dsa/dsa_key.c b/main/openssl/crypto/dsa/dsa_key.c
index c4aa86bc..9cf669b9 100644
--- a/main/openssl/crypto/dsa/dsa_key.c
+++ b/main/openssl/crypto/dsa/dsa_key.c
@@ -64,12 +64,28 @@
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dsa_builtin_keygen(DSA *dsa);
int DSA_generate_key(DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(dsa->meth->dsa_keygen)
return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_generate_key(dsa);
+#endif
return dsa_builtin_keygen(dsa);
}
diff --git a/main/openssl/crypto/dsa/dsa_lib.c b/main/openssl/crypto/dsa/dsa_lib.c
index e9b75902..96d8d0c4 100644
--- a/main/openssl/crypto/dsa/dsa_lib.c
+++ b/main/openssl/crypto/dsa/dsa_lib.c
@@ -70,6 +70,10 @@
#include <openssl/dh.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth)
const DSA_METHOD *DSA_get_default_method(void)
{
if(!default_DSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_openssl();
+ else
+ return DSA_OpenSSL();
+#else
default_DSA_method = DSA_OpenSSL();
+#endif
+ }
return default_DSA_method;
}
@@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine)
ret->method_mont_p=NULL;
ret->references=1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
@@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx)
DH *DSA_dup_DH(const DSA *r)
{
/* DSA has p, q, g, optional pub_key, optional priv_key.
- * DH has p, optional length, g, optional pub_key, optional priv_key.
+ * DH has p, optional length, g, optional pub_key, optional priv_key,
+ * optional q.
*/
DH *ret = NULL;
@@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r)
if ((ret->p = BN_dup(r->p)) == NULL)
goto err;
if (r->q != NULL)
+ {
ret->length = BN_num_bits(r->q);
+ if ((ret->q = BN_dup(r->q)) == NULL)
+ goto err;
+ }
if (r->g != NULL)
if ((ret->g = BN_dup(r->g)) == NULL)
goto err;
diff --git a/main/openssl/crypto/dsa/dsa_locl.h b/main/openssl/crypto/dsa/dsa_locl.h
index 2b8cfee3..21e2e452 100644
--- a/main/openssl/crypto/dsa/dsa_locl.h
+++ b/main/openssl/crypto/dsa/dsa_locl.h
@@ -56,4 +56,5 @@
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);
diff --git a/main/openssl/crypto/dsa/dsa_ossl.c b/main/openssl/crypto/dsa/dsa_ossl.c
index a3ddd7d2..177fc545 100644
--- a/main/openssl/crypto/dsa/dsa_ossl.c
+++ b/main/openssl/crypto/dsa/dsa_ossl.c
@@ -67,7 +67,9 @@
#include <openssl/asn1.h>
static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa);
-static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp);
+static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
static int dsa_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa);
static int dsa_init(DSA *dsa);
@@ -136,6 +138,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
BN_CTX *ctx=NULL;
int reason=ERR_R_BN_LIB;
DSA_SIG *ret=NULL;
+ int noredo = 0;
BN_init(&m);
BN_init(&xr);
@@ -150,10 +153,11 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
if (s == NULL) goto err;
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
-
+redo:
if ((dsa->kinv == NULL) || (dsa->r == NULL))
{
- if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
+ if (!dsa->meth->dsa_sign_setup(dsa,ctx,&kinv,&r,dgst,dlen))
+ goto err;
}
else
{
@@ -161,6 +165,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
dsa->kinv=NULL;
r=dsa->r;
dsa->r=NULL;
+ noredo = 1;
}
@@ -181,6 +186,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
ret=DSA_SIG_new();
if (ret == NULL) goto err;
+ /* Redo if r or s is zero as required by FIPS 186-3: this is
+ * very unlikely.
+ */
+ if (BN_is_zero(r) || BN_is_zero(s))
+ {
+ if (noredo)
+ {
+ reason = DSA_R_NEED_NEW_SETUP_VALUES;
+ goto err;
+ }
+ goto redo;
+ }
ret->r = r;
ret->s = s;
@@ -199,7 +216,9 @@ err:
return(ret);
}
-static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen)
{
BN_CTX *ctx;
BIGNUM k,kq,*K,*kinv=NULL,*r=NULL;
@@ -225,8 +244,21 @@ static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
/* Get random k */
do
- if (!BN_rand_range(&k, dsa->q)) goto err;
- while (BN_is_zero(&k));
+ {
+#ifndef OPENSSL_NO_SHA512
+ if (dsa->flags & DSA_FLAG_NONCE_FROM_HASH)
+ {
+ /* If DSA_FLAG_NONCE_FROM_HASH is set then we calculate k from
+ * SHA512(private_key + H(message) + random). This protects the
+ * private key from a weak PRNG. */
+ if (!BN_generate_dsa_nonce(&k, dsa->q, dsa->priv_key, dgst,
+ dlen, ctx))
+ goto err;
+ }
+ else
+#endif
+ if (!BN_rand_range(&k, dsa->q)) goto err;
+ } while (BN_is_zero(&k));
if ((dsa->flags & DSA_FLAG_NO_EXP_CONSTTIME) == 0)
{
BN_set_flags(&k, BN_FLG_CONSTTIME);
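
The FIPS 186-3 redo logic above makes precomputed (kinv, r) pairs single-use: if r or s comes out zero, cached values cannot be retried and signing fails with DSA_R_NEED_NEW_SETUP_VALUES. A sketch of the precompute pattern that the noredo branch guards, assuming a key without DSA_FLAG_NONCE_FROM_HASH (which DSA_sign_setup now rejects, see dsa_sign.c below); the helper name is illustrative:

#include <openssl/dsa.h>

/* Sketch: stash a precomputed nonce pair on the key; dsa_do_sign
 * consumes dsa->kinv/dsa->r and sets noredo, so a zero r or s is
 * reported rather than silently recomputed. */
static DSA_SIG *sign_with_precompute(DSA *dsa, const unsigned char *dgst,
				     int dlen)
	{
	if (!DSA_sign_setup(dsa, NULL, &dsa->kinv, &dsa->r))
		return NULL;
	return DSA_do_sign(dgst, dlen, dsa);
	}
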
diff --git a/main/openssl/crypto/dsa/dsa_pmeth.c b/main/openssl/crypto/dsa/dsa_pmeth.c
index e2df54fe..715d8d67 100644
--- a/main/openssl/crypto/dsa/dsa_pmeth.c
+++ b/main/openssl/crypto/dsa/dsa_pmeth.c
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
EVP_MD_type((const EVP_MD *)p2) != NID_dsa &&
EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
- EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
{
DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
return 0;
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
if (!dsa)
return 0;
ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
- NULL, 0, NULL, NULL, pcb);
+ NULL, 0, NULL, NULL, NULL, pcb);
if (ret)
EVP_PKEY_assign_DSA(pkey, dsa);
else
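
With SHA-384 and SHA-512 now accepted by pkey_dsa_ctrl, they can be selected through the EVP_PKEY interface; a sketch (helper name illustrative) of signing a precomputed digest with SHA-512 as the declared digest:

#include <openssl/evp.h>

/* Sketch: one-shot DSA signature over a digest via EVP_PKEY_sign;
 * before the hunk above, EVP_PKEY_CTX_set_signature_md would have
 * rejected EVP_sha512() for DSA keys. */
static int dsa_sign_evp(EVP_PKEY *pkey, const unsigned char *dgst,
			size_t dgst_len, unsigned char *sig, size_t *sig_len)
	{
	int ok = 0;
	EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
	if (ctx == NULL)
		return 0;
	if (EVP_PKEY_sign_init(ctx) > 0
	    && EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha512()) > 0
	    && EVP_PKEY_sign(ctx, sig, sig_len, dgst, dgst_len) > 0)
		ok = 1;
	EVP_PKEY_CTX_free(ctx);
	return ok;
	}
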
diff --git a/main/openssl/crypto/dsa/dsa_sign.c b/main/openssl/crypto/dsa/dsa_sign.c
index 17555e58..8ace300a 100644
--- a/main/openssl/crypto/dsa/dsa_sign.c
+++ b/main/openssl/crypto/dsa/dsa_sign.c
@@ -61,30 +61,61 @@
#include "cryptlib.h"
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#include <openssl/bn.h>
DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+ return NULL;
+ }
+#endif
return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
}
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
- unsigned int *siglen, DSA *dsa)
+int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
{
- DSA_SIG *s;
- RAND_seed(dgst, dlen);
- s=DSA_do_sign(dgst,dlen,dsa);
- if (s == NULL)
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
{
- *siglen=0;
- return(0);
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
}
- *siglen=i2d_DSA_SIG(s,&sig);
- DSA_SIG_free(s);
- return(1);
+#endif
+ if (dsa->flags & DSA_FLAG_NONCE_FROM_HASH)
+ {
+ /* You cannot precompute the DSA nonce if it is required to
+ * depend on the message. */
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NONCE_CANNOT_BE_PRECOMPUTED);
+ return 0;
+ }
+ return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp, NULL, 0);
}
-int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+DSA_SIG *DSA_SIG_new(void)
{
- return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
+ DSA_SIG *sig;
+ sig = OPENSSL_malloc(sizeof(DSA_SIG));
+ if (!sig)
+ return NULL;
+ sig->r = NULL;
+ sig->s = NULL;
+ return sig;
+ }
+
+void DSA_SIG_free(DSA_SIG *sig)
+ {
+ if (sig)
+ {
+ if (sig->r)
+ BN_free(sig->r);
+ if (sig->s)
+ BN_free(sig->s);
+ OPENSSL_free(sig);
+ }
}
diff --git a/main/openssl/crypto/dsa/dsa_vrf.c b/main/openssl/crypto/dsa/dsa_vrf.c
index 226a75ff..674cb5fa 100644
--- a/main/openssl/crypto/dsa/dsa_vrf.c
+++ b/main/openssl/crypto/dsa/dsa_vrf.c
@@ -64,26 +64,13 @@
int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+ return -1;
+ }
+#endif
return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- * 1: correct signature
- * 0: incorrect signature
- * -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
- const unsigned char *sigbuf, int siglen, DSA *dsa)
- {
- DSA_SIG *s;
- int ret=-1;
-
- s = DSA_SIG_new();
- if (s == NULL) return(ret);
- if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
- ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
- DSA_SIG_free(s);
- return(ret);
- }
diff --git a/main/openssl/crypto/dso/dso_dlfcn.c b/main/openssl/crypto/dso/dso_dlfcn.c
index c2bc6176..5f225480 100644
--- a/main/openssl/crypto/dso/dso_dlfcn.c
+++ b/main/openssl/crypto/dso/dso_dlfcn.c
@@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void)
# if defined(_AIX) || defined(__CYGWIN__) || \
defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
(defined(__osf__) && !defined(RTLD_NEXT)) || \
- (defined(__OpenBSD__) && !defined(RTLD_SELF))
+ (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+ defined(__ANDROID__)
# undef HAVE_DLINFO
# endif
#endif
diff --git a/main/openssl/crypto/dso/dso_lib.c b/main/openssl/crypto/dso/dso_lib.c
index 8a15b794..78015298 100644
--- a/main/openssl/crypto/dso/dso_lib.c
+++ b/main/openssl/crypto/dso/dso_lib.c
@@ -237,11 +237,19 @@ DSO *DSO_load(DSO *dso, const char *filename, DSO_METHOD *meth, int flags)
if(ret->meth->dso_load == NULL)
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_UNSUPPORTED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
if(!ret->meth->dso_load(ret))
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_LOAD_FAILED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
/* Load succeeded */
diff --git a/main/openssl/crypto/ec/ec.h b/main/openssl/crypto/ec/ec.h
index ee707813..d008a0da 100644
--- a/main/openssl/crypto/ec/ec.h
+++ b/main/openssl/crypto/ec/ec.h
@@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void);
*/
const EC_METHOD *EC_GFp_nist_method(void);
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
+
+/** Returns 64-bit optimized methods for nistp256
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+#ifndef OPENSSL_NO_EC2M
/********************************************************************/
/* EC_METHOD for curves over GF(2^m) */
/********************************************************************/
@@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void);
*/
const EC_METHOD *EC_GF2m_simple_method(void);
+#endif
+
/********************************************************************/
/* EC_GROUP functions */
@@ -255,10 +274,10 @@ int EC_GROUP_get_curve_name(const EC_GROUP *group);
void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
-void EC_GROUP_set_point_conversion_form(EC_GROUP *, point_conversion_form_t);
+void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
-unsigned char *EC_GROUP_get0_seed(const EC_GROUP *);
+unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
size_t EC_GROUP_get_seed_len(const EC_GROUP *);
size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
@@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co
*/
int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
/** Sets the parameters of an ec curve over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
* \param group EC_GROUP object
* \param p BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c
 * \return 1 on success and 0 if an error occurred
*/
int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Returns the number of bits needed to represent a field element
* \param group EC_GROUP object
* \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Creates a new EC_GROUP object with the specified parameters defined
* over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
* \param p BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Creates an EC_GROUP object with a curve specified by a NID
* \param nid NID of the OID of the curve name
* \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Sets the affine coordinates of an EC_POINT over GF2m
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
/** Encodes an EC_POINT object to an octet string
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -606,8 +626,8 @@ int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *c
*/
int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
-int EC_POINT_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
-int EC_POINTs_make_affine(const EC_GROUP *, size_t num, EC_POINT *[], BN_CTX *);
+int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
/** Computes r = generator * n + sum_{i=0}^{num-1} p[i] * m[i]
* \param group underlying EC_GROUP object
@@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
/* EC_GROUP_get_basis_type() returns the NID of the basis type
* used to represent the field elements */
int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1,
unsigned int *k2, unsigned int *k3);
+#endif
#define OPENSSL_EC_NAMED_CURVE 0x001
@@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY;
#define EC_PKEY_NO_PARAMETERS 0x001
#define EC_PKEY_NO_PUBKEY 0x002
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW 0x1
+#define EC_FLAG_FIPS_CHECKED 0x2
+
/** Creates a new EC_KEY object.
* \return EC_KEY object or NULL if an error occurred.
*/
EC_KEY *EC_KEY_new(void);
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
/** Creates a new EC_KEY object using a named curve as underlying
* EC_GROUP object.
* \param nid NID of the named curve.
@@ -768,16 +800,35 @@ const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
unsigned EC_KEY_get_enc_flags(const EC_KEY *key);
-void EC_KEY_set_enc_flags(EC_KEY *, unsigned int);
-point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *);
-void EC_KEY_set_conv_form(EC_KEY *, point_conversion_form_t);
+void EC_KEY_set_enc_flags(EC_KEY *eckey, unsigned int flags);
+point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
/* functions to set/get method specific data */
-void *EC_KEY_get_key_method_data(EC_KEY *,
+void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
-void EC_KEY_insert_key_method_data(EC_KEY *, void *data,
+/** Sets the key method data of an EC_KEY object, if none has yet been set.
+ * \param key EC_KEY object
+ * \param data opaque data to install.
+ * \param dup_func a function that duplicates |data|.
+ * \param free_func a function that frees |data|.
+ * \param clear_free_func a function that wipes and frees |data|.
+ * \return the previously set data pointer, or NULL if |data| was inserted.
+ */
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
/* wrapper functions for the underlying EC_GROUP object */
-void EC_KEY_set_asn1_flag(EC_KEY *, int);
+void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
+
+/** Sets whether ECDSA operations with the given key will calculate their k
+ * value from SHA512(private_key + H(message) + random) in order to protect
+ * against a weak PRNG.
+ * \param on Whether to calculate k from a hash or not
+ */
+void EC_KEY_set_nonce_from_hash(EC_KEY *key, int on);
+
+/** Returns the value of nonce_from_hash
+ */
+int EC_KEY_get_nonce_from_hash(const EC_KEY *key);
/** Creates a table of pre-computed multiples of the generator to
* accelerate further EC_KEY operations.
@@ -799,6 +850,15 @@ int EC_KEY_generate_key(EC_KEY *key);
*/
int EC_KEY_check_key(const EC_KEY *key);
+/** Sets a public key from affine coordinates, performing the
+ * necessary NIST PKV tests.
+ * \param key the EC_KEY object
+ * \param x public key x coordinate
+ * \param y public key y coordinate
+ * \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
/********************************************************************/
/* de- and encoding functions for SEC1 ECPrivateKey */
@@ -926,6 +986,7 @@ void ERR_load_EC_strings(void);
/* Error codes for the EC functions. */
/* Function codes. */
+#define EC_F_BN_TO_FELEM 224
#define EC_F_COMPUTE_WNAF 143
#define EC_F_D2I_ECPARAMETERS 144
#define EC_F_D2I_ECPKPARAMETERS 145
@@ -968,6 +1029,15 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_GFP_MONT_FIELD_SQR 132
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE 189
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
#define EC_F_EC_GFP_NIST_FIELD_MUL 200
#define EC_F_EC_GFP_NIST_FIELD_SQR 201
#define EC_F_EC_GFP_NIST_GROUP_SET_CURVE 202
@@ -1010,6 +1080,7 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_KEY_NEW 182
#define EC_F_EC_KEY_PRINT 180
#define EC_F_EC_KEY_PRINT_FP 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES 229
#define EC_F_EC_POINTS_MAKE_AFFINE 136
#define EC_F_EC_POINT_ADD 112
#define EC_F_EC_POINT_CMP 113
@@ -1040,6 +1111,9 @@ void ERR_load_EC_strings(void);
#define EC_F_I2D_ECPKPARAMETERS 191
#define EC_F_I2D_ECPRIVATEKEY 192
#define EC_F_I2O_ECPUBLICKEY 151
+#define EC_F_NISTP224_PRE_COMP_NEW 227
+#define EC_F_NISTP256_PRE_COMP_NEW 236
+#define EC_F_NISTP521_PRE_COMP_NEW 237
#define EC_F_O2I_ECPUBLICKEY 152
#define EC_F_OLD_EC_PRIV_DECODE 222
#define EC_F_PKEY_EC_CTRL 197
@@ -1052,12 +1126,15 @@ void ERR_load_EC_strings(void);
/* Reason codes. */
#define EC_R_ASN1_ERROR 115
#define EC_R_ASN1_UNKNOWN_FIELD 116
+#define EC_R_BIGNUM_OUT_OF_RANGE 144
#define EC_R_BUFFER_TOO_SMALL 100
+#define EC_R_COORDINATES_OUT_OF_RANGE 146
#define EC_R_D2I_ECPKPARAMETERS_FAILURE 117
#define EC_R_DECODE_ERROR 142
#define EC_R_DISCRIMINANT_IS_ZERO 118
#define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE 119
#define EC_R_FIELD_TOO_LARGE 143
+#define EC_R_GF2M_NOT_SUPPORTED 147
#define EC_R_GROUP2PKPARAMETERS_FAILURE 120
#define EC_R_I2D_ECPKPARAMETERS_FAILURE 121
#define EC_R_INCOMPATIBLE_OBJECTS 101
@@ -1092,6 +1169,7 @@ void ERR_load_EC_strings(void);
#define EC_R_UNKNOWN_GROUP 129
#define EC_R_UNKNOWN_ORDER 114
#define EC_R_UNSUPPORTED_FIELD 131
+#define EC_R_WRONG_CURVE_PARAMETERS 145
#define EC_R_WRONG_ORDER 130
#ifdef __cplusplus
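
A sketch of the nonce-from-hash knob declared above applied to ECDSA, assuming an illustrative P-256 key; ECDSA_do_sign comes from ecdsa.h and the helper name is made up:

#include <openssl/ec.h>
#include <openssl/ecdsa.h>
#include <openssl/obj_mac.h>

/* Sketch: fresh P-256 key with hashed-nonce ECDSA enabled, so k is
 * derived from SHA512(private_key + H(message) + random) as above. */
static ECDSA_SIG *sign_p256(const unsigned char *dgst, int dgst_len)
	{
	ECDSA_SIG *sig = NULL;
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
	if (key != NULL && EC_KEY_generate_key(key))
		{
		EC_KEY_set_nonce_from_hash(key, 1);
		sig = ECDSA_do_sign(dgst, dgst_len, key);
		}
	EC_KEY_free(key);
	return sig;
	}
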
diff --git a/main/openssl/crypto/ec/ec2_mult.c b/main/openssl/crypto/ec/ec2_mult.c
index e12b9b28..26f4a783 100644
--- a/main/openssl/crypto/ec/ec2_mult.c
+++ b/main/openssl/crypto/ec/ec2_mult.c
@@ -71,6 +71,8 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective
* coordinates.
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group)
{
return ec_wNAF_have_precompute_mult(group);
}
+
+#endif
diff --git a/main/openssl/crypto/ec/ec2_oct.c b/main/openssl/crypto/ec/ec2_oct.c
new file mode 100644
index 00000000..f1d75e5d
--- /dev/null
+++ b/main/openssl/crypto/ec/ec2_oct.c
@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ *
+ * Harper, Menezes, Vanstone:
+ * "Public-Key Cryptosystems with Very Small Key Lengths",
+ * EUROCRYPT '92, Springer-Verlag LNCS 658,
+ * published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp, *x, *y, *z;
+ int ret = 0, z0;
+
+ /* clear error queue */
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0) ? 1 : 0;
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ z = BN_CTX_get(ctx);
+ if (z == NULL) goto err;
+
+ if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+ if (BN_is_zero(x))
+ {
+ if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+ }
+ else
+ {
+ if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+ if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+ if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+ if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+ if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+ z0 = (BN_is_odd(z)) ? 1 : 0;
+ if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+ if (z0 != y_bit)
+ {
+ if (!BN_GF2m_add(y, y, x)) goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
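
A sketch of driving the routine above through its public wrapper, with sect163k1 as an illustrative curve and the helper name made up:

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Sketch: recover an affine point from a compressed x plus parity bit.
 * Internally this solves z^2 + z = x + a + b/x^2 and picks the root
 * whose parity matches y_bit, as implemented above. */
static int check_compressed(const BIGNUM *x, int y_bit)
	{
	int ok = 0;
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_sect163k1);
	EC_POINT *point = group ? EC_POINT_new(group) : NULL;
	if (point != NULL)
		ok = EC_POINT_set_compressed_coordinates_GF2m(group, point,
							      x, y_bit, NULL);
	EC_POINT_free(point);
	EC_GROUP_free(group);
	return ok;
	}
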
+
+/* Converts an EC_POINT to an octet string.
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required, an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ buf[0] = form;
+ if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (BN_is_odd(yxi)) buf[0]++;
+ }
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+/* Converts an octet string representation to an EC_POINT.
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (y_bit != BN_is_odd(yxi))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+#endif
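A usage sketch for the two conversions above (not part of the patch; the curve choice, and an EC2M-enabled build, are assumptions). It exercises the NULL-buffer length query documented on point2oct, then round-trips the group generator:

#include <openssl/ec.h>
#include <openssl/obj_mac.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_sect163k1);
	const EC_POINT *gen;
	EC_POINT *copy = NULL;
	unsigned char *buf = NULL;
	size_t len;
	int ok = 0;

	if (group == NULL)
		return 1;
	gen = EC_GROUP_get0_generator(group);
	copy = EC_POINT_new(group);

	/* With buf == NULL, point2oct only reports the required length. */
	len = EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
				NULL, 0, NULL);
	if (len != 0 && copy != NULL && (buf = malloc(len)) != NULL
	    && EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
				buf, len, NULL) == len
	    && EC_POINT_oct2point(group, copy, buf, len, NULL)
	    && EC_POINT_cmp(group, gen, copy, NULL) == 0)
		ok = 1;

	printf("round trip: %s\n", ok ? "ok" : "failed");
	free(buf);
	EC_POINT_free(copy);
	EC_GROUP_free(group);
	return ok ? 0 : 1;
	}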
diff --git a/main/openssl/crypto/ec/ec2_smpl.c b/main/openssl/crypto/ec/ec2_smpl.c
index af94458c..e0e59c7d 100644
--- a/main/openssl/crypto/ec/ec2_smpl.c
+++ b/main/openssl/crypto/ec/ec2_smpl.c
@@ -71,10 +71,20 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GF2m_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gf2m_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_characteristic_two_field,
ec_GF2m_simple_group_init,
ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* get_Jprojective_coordinates_GFp */,
ec_GF2m_simple_point_set_affine_coordinates,
ec_GF2m_simple_point_get_affine_coordinates,
- ec_GF2m_simple_set_compressed_coordinates,
- ec_GF2m_simple_point2oct,
- ec_GF2m_simple_oct2point,
+ 0,0,0,
ec_GF2m_simple_add,
ec_GF2m_simple_dbl,
ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_
return ret;
}
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- *
- * Harper, Menezes, Vanstone:
- * "Public-Key Cryptosystems with Very Small Key Lengths",
- * EUROCRYPT '92, Springer-Verlag LNCS 658,
- * published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp, *x, *y, *z;
- int ret = 0, z0;
-
- /* clear error queue */
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0) ? 1 : 0;
-
- BN_CTX_start(ctx);
- tmp = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- z = BN_CTX_get(ctx);
- if (z == NULL) goto err;
-
- if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
- if (BN_is_zero(x))
- {
- if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
- }
- else
- {
- if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
- if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
- if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
- if (!BN_GF2m_add(tmp, x, tmp)) goto err;
- if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
- z0 = (BN_is_odd(z)) ? 1 : 0;
- if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
- if (z0 != y_bit)
- {
- if (!BN_GF2m_add(y, y, x)) goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-/* Converts an EC_POINT to an octet string.
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y, *yxi;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- buf[0] = form;
- if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (BN_is_odd(yxi)) buf[0]++;
- }
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-/* Converts an octet string representation to an EC_POINT.
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y, *yxi;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (y_bit != BN_is_odd(yxi))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
/* Computes a + b and stores the result in r. r could be a or b, a could be b.
* Uses algorithm A.10.2 of IEEE P1363.
*/
@@ -887,7 +562,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
field_sqr = group->meth->field_sqr;
/* only support affine coordinates */
- if (!point->Z_is_one) goto err;
+ if (!point->Z_is_one) return -1;
if (ctx == NULL)
{
@@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
{
return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
}
+
+#endif
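The three slots zeroed above ("0,0,0") are what the new EC_FLAGS_DEFAULT_OCT flag compensates for: the public entry points now fall back to the shared simple implementations when the flag is set. A simplified sketch of that dispatch, paraphrasing the wrapper in the new ec_oct.c rather than quoting it:

size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point,
	point_conversion_form_t form, unsigned char *buf, size_t len, BN_CTX *ctx)
	{
	if (group->meth->flags & EC_FLAGS_DEFAULT_OCT)
		{
		/* method left its oct handlers at 0; use the default code */
		if (group->meth->field_type == NID_X9_62_characteristic_two_field)
			return ec_GF2m_simple_point2oct(group, point, form,
							buf, len, ctx);
		return ec_GFp_simple_point2oct(group, point, form,
						buf, len, ctx);
		}
	if (group->meth->point2oct == 0)
		{
		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
		return 0;
		}
	return group->meth->point2oct(group, point, form, buf, len, ctx);
	}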
diff --git a/main/openssl/crypto/ec/ec_ameth.c b/main/openssl/crypto/ec/ec_ameth.c
index c00f7d74..0ce45240 100644
--- a/main/openssl/crypto/ec/ec_ameth.c
+++ b/main/openssl/crypto/ec/ec_ameth.c
@@ -88,7 +88,7 @@ static int eckey_param2type(int *pptype, void **ppval, EC_KEY *ec_key)
if (!pstr)
return 0;
pstr->length = i2d_ECParameters(ec_key, &pstr->data);
- if (pstr->length < 0)
+ if (pstr->length <= 0)
{
ASN1_STRING_free(pstr);
ECerr(EC_F_ECKEY_PARAM2TYPE, ERR_R_EC_LIB);
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth =
ec_copy_parameters,
ec_cmp_parameters,
eckey_param_print,
+ 0,
int_ec_free,
ec_pkey_ctrl,
diff --git a/main/openssl/crypto/ec/ec_asn1.c b/main/openssl/crypto/ec/ec_asn1.c
index ae555398..145807b6 100644
--- a/main/openssl/crypto/ec/ec_asn1.c
+++ b/main/openssl/crypto/ec/ec_asn1.c
@@ -83,13 +83,14 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group)
/* everything else is currently not supported */
return 0;
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
{
if (group == NULL)
return 0;
- if (EC_GROUP_method_of(group)->group_set_curve != ec_GF2m_simple_group_set_curve
+ if (EC_METHOD_get_field_type(EC_GROUP_method_of(group)) !=
+ NID_X9_62_characteristic_two_field
|| !((group->poly[0] != 0) && (group->poly[1] != 0) && (group->poly[2] == 0)))
{
ECerr(EC_F_EC_GROUP_GET_TRINOMIAL_BASIS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -101,14 +102,14 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
return 1;
}
-
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
unsigned int *k2, unsigned int *k3)
{
if (group == NULL)
return 0;
- if (EC_GROUP_method_of(group)->group_set_curve != ec_GF2m_simple_group_set_curve
+ if (EC_METHOD_get_field_type(EC_GROUP_method_of(group)) !=
+ NID_X9_62_characteristic_two_field
|| !((group->poly[0] != 0) && (group->poly[1] != 0) && (group->poly[2] != 0) && (group->poly[3] != 0) && (group->poly[4] == 0)))
{
ECerr(EC_F_EC_GROUP_GET_PENTANOMIAL_BASIS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -124,7 +125,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
return 1;
}
-
+#endif
/* some structures needed for the asn1 encoding */
@@ -340,6 +341,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
else /* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
int field_type;
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +426,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
}
+#endif
ok = 1;
@@ -456,6 +464,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* nid == NID_X9_62_characteristic_two_field */
{
if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +473,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
-
+#endif
len_1 = (size_t)BN_num_bytes(tmp_1);
len_2 = (size_t)BN_num_bytes(tmp_2);
@@ -775,8 +784,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* get the field parameters */
tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -862,6 +876,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* create the EC_GROUP structure */
ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
}
+#endif
else if (tmp == NID_X9_62_prime_field)
{
/* we have a curve over a prime field */
@@ -1065,6 +1080,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
if ((group = ec_asn1_pkparameters2group(params)) == NULL)
{
ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+ ECPKPARAMETERS_free(params);
return NULL;
}
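The trinomial/pentanomial tests above lean on the group->poly convention from ec_lcl.h, which we read as: the array lists the exponents of the reduction polynomial in decreasing order, ending with the constant term 0. Under that assumption:

/* trinomial   t^m + t^k + 1             -> poly = { m, k, 0, ... }
 *                                          hence poly[1] != 0, poly[2] == 0
 * pentanomial t^m + t^k3 + t^k2 + t^k1 + 1
 *                                       -> poly = { m, k3, k2, k1, 0 }
 *                                          hence poly[3] != 0, poly[4] == 0 */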
diff --git a/main/openssl/crypto/ec/ec_curve.c b/main/openssl/crypto/ec/ec_curve.c
index 23274e40..c72fb269 100644
--- a/main/openssl/crypto/ec/ec_curve.c
+++ b/main/openssl/crypto/ec/ec_curve.c
@@ -3,7 +3,7 @@
* Written by Nils Larsch for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
#include "ec_lcl.h"
#include <openssl/err.h>
#include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
typedef struct {
int field_type, /* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; }
0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
};
+#ifndef OPENSSL_NO_EC2M
+
/* characteristic two curves */
static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; }
{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */
0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
- 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
+ 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
0x07,
0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; }
0xBA,0xFC,0xA7,0x5E }
};
+#endif
+
typedef struct _ec_list_element_st {
int nid;
const EC_CURVE_DATA *data;
+ const EC_METHOD *(*meth)(void);
const char *comment;
} ec_list_element;
static const ec_list_element curve_list[] = {
- /* prime field curves */
+ /* prime field curves */
/* secg curves */
- { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
- { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
- { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
- { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+ { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+ { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
- { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
- { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
- { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"},
- { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+ { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+ { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+ { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
- { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
- { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+ { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
/* X9.62 curves */
- { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
- { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+ { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
/* characteristic two field curves */
/* NIST/SECG curves */
- { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
- { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
- { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
- { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" },
- { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
- { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" },
- { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
- { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
- { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
- { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" },
- { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+ { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+ { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+ { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+ { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+ { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
/* X9.62 curves */
- { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
- { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
- { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
- { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
- { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
- { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
- { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+ { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+ { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+ { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+ { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+ { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+ { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+ { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
/* the WAP/WTLS curves
* [unlike SECG, spec has its own OIDs for curves from X9.62] */
- { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
- { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+ { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+ { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
/* IPSec curves */
- { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
- { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+ { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+ { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
};
#define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
{
EC_GROUP *group=NULL;
EC_POINT *P=NULL;
BN_CTX *ctx=NULL;
- BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+ BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
int ok=0;
int seed_len,param_len;
+ const EC_METHOD *meth;
+ const EC_CURVE_DATA *data;
const unsigned char *params;
if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
+ data = curve.data;
seed_len = data->seed_len;
param_len = data->param_len;
- params = (const unsigned char *)(data+1); /* skip header */
- params += seed_len; /* skip seed */
+ params = (const unsigned char *)(data+1); /* skip header */
+ params += seed_len; /* skip seed */
if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
- if (data->field_type == NID_X9_62_prime_field)
+ if (curve.meth != 0)
+ {
+ meth = curve.meth();
+ if (((group = EC_GROUP_new(meth)) == NULL) ||
+ (!(group->meth->group_set_curve(group, p, a, b, ctx))))
+ {
+ ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+ goto err;
+ }
+ }
+ else if (data->field_type == NID_X9_62_prime_field)
{
if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
{
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* field_type == NID_X9_62_characteristic_two_field */
{
if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#endif
if ((P = EC_POINT_new(group)) == NULL)
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
}
-
+
if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
goto err;
}
- if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+ if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
for (i=0; i<curve_list_length; i++)
if (curve_list[i].nid == nid)
{
- ret = ec_group_new_from_data(curve_list[i].data);
+ ret = ec_group_new_from_data(curve_list[i]);
break;
}
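From the caller's side the new meth column is invisible; a minimal sketch, assuming a build with the nistp 64_GCC_128 code enabled:

#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Illustrative only: name-based construction is unchanged, but the table
 * above now silently selects EC_GFp_nistp224_method() for this NID when
 * compiled in, and the generic prime-field method otherwise. */
static EC_GROUP *new_p224(void)
	{
	return EC_GROUP_new_by_curve_name(NID_secp224r1);
	}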
diff --git a/main/openssl/crypto/ec/ec_cvt.c b/main/openssl/crypto/ec/ec_cvt.c
index d45640ba..bfcbab35 100644
--- a/main/openssl/crypto/ec/ec_cvt.c
+++ b/main/openssl/crypto/ec/ec_cvt.c
@@ -78,7 +78,32 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
const EC_METHOD *meth;
EC_GROUP *ret;
+#if defined(OPENSSL_BN_ASM_MONT)
+ /*
+ * This might appear controversial, but the fact is that generic
+ * prime method was observed to deliver better performance even
+ * for NIST primes on a range of platforms, e.g.: 60%-15%
+ * improvement on IA-64, ~25% on ARM, 30%-90% on P4, 20%-25%
+ * in 32-bit build and 35%-12% in 64-bit build on Core2...
+ * Coefficients are relative to optimized bn_nist.c for most
+ * intensive ECDSA verify and ECDH operations for 192- and 521-
+ * bit keys respectively. Choice of these boundary values is
+ * arguable, because the dependency of improvement coefficient
+ * from key length is not a "monotone" curve. For example while
+ * 571-bit result is 23% on ARM, 384-bit one is -1%. But it's
+ * generally faster, sometimes "respectfully" faster, sometimes
+ * "tolerably" slower... What effectively happens is that loop
+ * with bn_mul_add_words is put against bn_mul_mont, and the
+ * latter "wins" on short vectors. Correct solution should be
+ * implementing dedicated NxN multiplication subroutines for
+ * small N. But till it materializes, let's stick to generic
+ * prime method...
+ * <appro>
+ */
+ meth = EC_GFp_mont_method();
+#else
meth = EC_GFp_nist_method();
+#endif
ret = EC_GROUP_new(meth);
if (ret == NULL)
@@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
-
+#ifndef OPENSSL_NO_EC2M
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
const EC_METHOD *meth;
@@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
+#endif
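For groups built from explicit parameters the same selection happens inside EC_GROUP_new_curve_GFp, so callers are unaffected. A hedged sketch; the hex constants are the NIST P-192 parameters, quoted from memory for illustration:

#include <openssl/bn.h>
#include <openssl/ec.h>

static EC_GROUP *p192_from_params(void)
	{
	BIGNUM *p = NULL, *a = NULL, *b = NULL;
	EC_GROUP *group = NULL;

	if (BN_hex2bn(&p, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFF")
	    && BN_hex2bn(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFC")
	    && BN_hex2bn(&b, "64210519E59C80E70FA7E9AB72243049FEB8DEECC146B9B1"))
		/* Montgomery method if OPENSSL_BN_ASM_MONT, NIST otherwise */
		group = EC_GROUP_new_curve_GFp(p, a, b, NULL);

	BN_free(p);
	BN_free(a);
	BN_free(b);
	return group;	/* NULL on failure */
	}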
diff --git a/main/openssl/crypto/ec/ec_err.c b/main/openssl/crypto/ec/ec_err.c
index 84b48333..0d193987 100644
--- a/main/openssl/crypto/ec/ec_err.c
+++ b/main/openssl/crypto/ec/ec_err.c
@@ -1,6 +1,6 @@
/* crypto/ec/ec_err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA EC_str_functs[]=
{
+{ERR_FUNC(EC_F_BN_TO_FELEM), "BN_TO_FELEM"},
{ERR_FUNC(EC_F_COMPUTE_WNAF), "COMPUTE_WNAF"},
{ERR_FUNC(EC_F_D2I_ECPARAMETERS), "d2i_ECParameters"},
{ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"},
@@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR), "ec_GFp_mont_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE), "ec_GFp_mont_group_set_curve"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP), "EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE), "ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL), "ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE), "ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL), "ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE), "ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL), "ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp521_point_get_affine_coordinates"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL), "ec_GFp_nist_field_mul"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR), "ec_GFp_nist_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE), "ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_KEY_NEW), "EC_KEY_new"},
{ERR_FUNC(EC_F_EC_KEY_PRINT), "EC_KEY_print"},
{ERR_FUNC(EC_F_EC_KEY_PRINT_FP), "EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES), "EC_KEY_set_public_key_affine_coordinates"},
{ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE), "EC_POINTs_make_affine"},
{ERR_FUNC(EC_F_EC_POINT_ADD), "EC_POINT_add"},
{ERR_FUNC(EC_F_EC_POINT_CMP), "EC_POINT_cmp"},
@@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_I2D_ECPKPARAMETERS), "i2d_ECPKParameters"},
{ERR_FUNC(EC_F_I2D_ECPRIVATEKEY), "i2d_ECPrivateKey"},
{ERR_FUNC(EC_F_I2O_ECPUBLICKEY), "i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"},
{ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"},
{ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"},
{ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]=
{
{ERR_REASON(EC_R_ASN1_ERROR) ,"asn1 error"},
{ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD) ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE) ,"bignum out of range"},
{ERR_REASON(EC_R_BUFFER_TOO_SMALL) ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
{ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
{ERR_REASON(EC_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO) ,"discriminant is zero"},
{ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
{ERR_REASON(EC_R_FIELD_TOO_LARGE) ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED) ,"gf2m not supported"},
{ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
{ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
{ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS) ,"incompatible objects"},
@@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]=
{ERR_REASON(EC_R_UNKNOWN_GROUP) ,"unknown group"},
{ERR_REASON(EC_R_UNKNOWN_ORDER) ,"unknown order"},
{ERR_REASON(EC_R_UNSUPPORTED_FIELD) ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
{ERR_REASON(EC_R_WRONG_ORDER) ,"wrong order"},
{0,NULL}
};
diff --git a/main/openssl/crypto/ec/ec_key.c b/main/openssl/crypto/ec/ec_key.c
index 522802c0..73dd7b97 100644
--- a/main/openssl/crypto/ec/ec_key.c
+++ b/main/openssl/crypto/ec/ec_key.c
@@ -64,7 +64,9 @@
#include <string.h>
#include "ec_lcl.h"
#include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
EC_KEY *EC_KEY_new(void)
{
@@ -78,10 +80,12 @@ EC_KEY *EC_KEY_new(void)
}
ret->version = 1;
+ ret->flags = 0;
ret->group = NULL;
ret->pub_key = NULL;
ret->priv_key= NULL;
ret->enc_flag= 0;
+ ret->nonce_from_hash_flag = 0;
ret->conv_form = POINT_CONVERSION_UNCOMPRESSED;
ret->references= 1;
ret->method_data = NULL;
@@ -195,8 +199,10 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src)
/* copy the rest */
dest->enc_flag = src->enc_flag;
+ dest->nonce_from_hash_flag = src->nonce_from_hash_flag;
dest->conv_form = src->conv_form;
dest->version = src->version;
+ dest->flags = src->flags;
return dest;
}
@@ -237,6 +243,11 @@ int EC_KEY_generate_key(EC_KEY *eckey)
BIGNUM *priv_key = NULL, *order = NULL;
EC_POINT *pub_key = NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_key_generate_key(eckey);
+#endif
+
if (!eckey || !eckey->group)
{
ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +382,82 @@ err:
return(ok);
}
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+ {
+ BN_CTX *ctx = NULL;
+ BIGNUM *tx, *ty;
+ EC_POINT *point = NULL;
+ int ok = 0, tmp_nid, is_char_two = 0;
+
+ if (!key || !key->group || !x || !y)
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ ctx = BN_CTX_new();
+ if (!ctx)
+ goto err;
+
+ point = EC_POINT_new(key->group);
+
+ if (!point)
+ goto err;
+
+ tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+ if (tmp_nid == NID_X9_62_characteristic_two_field)
+ is_char_two = 1;
+
+ tx = BN_CTX_get(ctx);
+ ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+ if (is_char_two)
+ {
+ if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ else
+#endif
+ {
+ if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ /* Check if retrieved coordinates match originals: if not values
+ * are out of range.
+ */
+ if (BN_cmp(x, tx) || BN_cmp(y, ty))
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ if (!EC_KEY_set_public_key(key, point))
+ goto err;
+
+ if (EC_KEY_check_key(key) == 0)
+ goto err;
+
+ ok = 1;
+
+ err:
+ if (ctx)
+ BN_CTX_free(ctx);
+ if (point)
+ EC_POINT_free(point);
+ return ok;
+
+ }
+
const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
{
return key->group;
@@ -420,6 +507,16 @@ void EC_KEY_set_enc_flags(EC_KEY *key, unsigned int flags)
key->enc_flag = flags;
}
+int EC_KEY_get_nonce_from_hash(const EC_KEY *key)
+ {
+ return key->nonce_from_hash_flag;
+ }
+
+void EC_KEY_set_nonce_from_hash(EC_KEY *key, int on)
+ {
+ key->nonce_from_hash_flag = on != 0;
+ }
+
point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key)
{
return key->conv_form;
@@ -435,18 +532,27 @@ void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform)
void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
- return EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ void *ret;
+
+ CRYPTO_r_lock(CRYPTO_LOCK_EC);
+ ret = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ CRYPTO_r_unlock(CRYPTO_LOCK_EC);
+
+ return ret;
}
-void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
EC_EXTRA_DATA *ex_data;
+
CRYPTO_w_lock(CRYPTO_LOCK_EC);
ex_data = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
if (ex_data == NULL)
EC_EX_DATA_set_data(&key->method_data, data, dup_func, free_func, clear_free_func);
CRYPTO_w_unlock(CRYPTO_LOCK_EC);
+
+ return ex_data;
}
void EC_KEY_set_asn1_flag(EC_KEY *key, int flag)
@@ -461,3 +567,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx)
return 0;
return EC_GROUP_precompute_mult(key->group, ctx);
}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+ {
+ return key->flags;
+ }
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+ {
+ key->flags |= flags;
+ }
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+ {
+ key->flags &= ~flags;
+ }
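A usage sketch for the new coordinate-validation helper (curve choice and surrounding error handling are our assumptions, not part of the patch):

#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* x and y are assumed to come from an untrusted peer; the helper rejects
 * values that are out of range or not on the curve, per the BN_cmp
 * round-trip check above. */
static EC_KEY *load_peer_key(BIGNUM *x, BIGNUM *y)
	{
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

	if (key == NULL)
		return NULL;
	if (!EC_KEY_set_public_key_affine_coordinates(key, x, y))
		{
		EC_KEY_free(key);
		return NULL;
		}
	return key;
	}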
diff --git a/main/openssl/crypto/ec/ec_lcl.h b/main/openssl/crypto/ec/ec_lcl.h
index 3e2c34b0..6f714c75 100644
--- a/main/openssl/crypto/ec/ec_lcl.h
+++ b/main/openssl/crypto/ec/ec_lcl.h
@@ -3,7 +3,7 @@
* Originally written by Bodo Moeller for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
# endif
#endif
+/* Use default functions for point2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT 0x1
+
/* Structure details are not part of the exported interface,
* so all this may change in future versions. */
struct ec_method_st {
+ /* Various method flags */
+ int flags;
/* used by EC_METHOD_get_field_type: */
int field_type; /* a NID */
@@ -241,9 +246,11 @@ struct ec_key_st {
BIGNUM *priv_key;
unsigned int enc_flag;
+ char nonce_from_hash_flag;
point_conversion_form_t conv_form;
int references;
+ int flags;
EC_EXTRA_DATA *method_data;
} /* EC_KEY */;
@@ -391,3 +398,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+ size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+ size_t felem_size, void *tmp_felems,
+ void (*felem_one)(void *out),
+ int (*felem_is_zero)(const void *in),
+ void (*felem_assign)(void *out, const void *in),
+ void (*felem_square)(void *out, const void *in),
+ void (*felem_mul)(void *out, const void *in1, const void *in2),
+ void (*felem_inv)(void *out, const void *in),
+ void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif
diff --git a/main/openssl/crypto/ec/ec_lib.c b/main/openssl/crypto/ec/ec_lib.c
index dd7da0fc..de9a0cc2 100644
--- a/main/openssl/crypto/ec/ec_lib.c
+++ b/main/openssl/crypto/ec/ec_lib.c
@@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM
}
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#endif
int EC_GROUP_get_degree(const EC_GROUP *group)
{
@@ -480,10 +480,10 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx)
if (EC_METHOD_get_field_type(EC_GROUP_method_of(a)) !=
EC_METHOD_get_field_type(EC_GROUP_method_of(b)))
return 1;
- /* compare the curve name (if present) */
+ /* compare the curve name (if present in both) */
if (EC_GROUP_get_curve_name(a) && EC_GROUP_get_curve_name(b) &&
- EC_GROUP_get_curve_name(a) == EC_GROUP_get_curve_name(b))
- return 0;
+ EC_GROUP_get_curve_name(a) != EC_GROUP_get_curve_name(b))
+ return 1;
if (!ctx)
ctx_new = ctx = BN_CTX_new();
@@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
{
@@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
}
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#endif
int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
{
@@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *
}
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->point2oct == 0)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point2oct(group, point, form, buf, len, ctx);
- }
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->oct2point == 0)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->oct2point(group, point, buf, len, ctx);
- }
-
+#endif
int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
@@ -1061,12 +993,12 @@ int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN
if (group->meth->point_cmp == 0)
{
ECerr(EC_F_EC_POINT_CMP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
+ return -1;
}
if ((group->meth != a->meth) || (a->meth != b->meth))
{
ECerr(EC_F_EC_POINT_CMP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
+ return -1;
}
return group->meth->point_cmp(group, a, b, ctx);
}
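The two hunks above tighten the comparison API contracts: EC_GROUP_cmp()
now uses the curve NID only as a fast path for inequality (two groups with
different names cannot be equal, while identical names still fall through
to the full field/curve comparison), and EC_POINT_cmp() reports failure as
-1 instead of 0, since 0 already means "equal". A minimal caller sketch
(not part of the patch) showing the three-way result that code built on
this diff should expect:

    int r = EC_POINT_cmp(group, a, b, ctx);
    if (r == -1)
            ;       /* error, e.g. incompatible objects */
    else if (r == 0)
            ;       /* points are equal */
    else
            ;       /* points differ */

A bare "if (!EC_POINT_cmp(...))" would have treated the old error return
as a match, which is exactly the ambiguity the hunk removes.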
diff --git a/main/openssl/crypto/ec/ec_oct.c b/main/openssl/crypto/ec/ec_oct.c
new file mode 100644
index 00000000..fd9db079
--- /dev/null
+++ b/main/openssl/crypto/ec/ec_oct.c
@@ -0,0 +1,199 @@
+/* crypto/ec/ec_oct.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+#endif
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->point2oct == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_point2oct(group, point,
+ form, buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_point2oct(group, point,
+ form, buf, len, ctx);
+#endif
+ }
+
+ return group->meth->point2oct(group, point, form, buf, len, ctx);
+ }
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->oct2point == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_oct2point(group, point,
+ buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_oct2point(group, point,
+ buf, len, ctx);
+#endif
+ }
+ return group->meth->oct2point(group, point, buf, len, ctx);
+ }
+
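ec_oct.c collects the octet-string entry points that used to live in
ec_lib.c. The new wrinkle is the EC_FLAGS_DEFAULT_OCT method flag: when a
method sets it, these wrappers dispatch straight to the shared
ec_GFp_simple_* (or ec_GF2m_simple_*) routines, which is why the method
tables in ecp_mont.c, ecp_nist.c and ecp_smpl.c below can leave their
point_set_compressed_coordinates/point2oct/oct2point slots zeroed. A
hedged round-trip sketch (not part of the patch) exercising the
compressed encoding through this path:

    #include <openssl/ec.h>
    #include <openssl/obj_mac.h>

    int roundtrip_compressed(void)
        {
        EC_GROUP *g = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
        EC_POINT *p = g ? EC_POINT_new(g) : NULL;
        unsigned char buf[1 + 32];  /* 1 + field_len for P-256 */
        size_t len;
        int ok = 0;

        if (p != NULL)
            {
            /* encode the generator, then decode it back */
            len = EC_POINT_point2oct(g, EC_GROUP_get0_generator(g),
                    POINT_CONVERSION_COMPRESSED, buf, sizeof(buf), NULL);
            if (len != 0 && EC_POINT_oct2point(g, p, buf, len, NULL))
                ok = (EC_POINT_cmp(g, p,
                        EC_GROUP_get0_generator(g), NULL) == 0);
            }
        EC_POINT_free(p);
        EC_GROUP_free(g);
        return ok;
        }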
diff --git a/main/openssl/crypto/ec/ec_pmeth.c b/main/openssl/crypto/ec/ec_pmeth.c
index f433076c..66ee397d 100644
--- a/main/openssl/crypto/ec/ec_pmeth.c
+++ b/main/openssl/crypto/ec/ec_pmeth.c
@@ -188,7 +188,7 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key, size_t *keylen)
pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec);
- /* NB: unlike PKS#3 DH, if *outlen is less than maximum size this is
+ /* NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is
* not an error, the result is truncated.
*/
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_MD:
if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
diff --git a/main/openssl/crypto/ec/eck_prn.c b/main/openssl/crypto/ec/eck_prn.c
index 7d3e175a..06de8f39 100644
--- a/main/openssl/crypto/ec/eck_prn.c
+++ b/main/openssl/crypto/ec/eck_prn.c
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
reason = ERR_R_MALLOC_FAILURE;
goto err;
}
-
+#ifndef OPENSSL_NO_EC2M
if (is_char_two)
{
if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
}
}
else /* prime field */
+#endif
{
if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
{
diff --git a/main/openssl/crypto/ec/ecp_mont.c b/main/openssl/crypto/ec/ecp_mont.c
index 9fc4a466..f04f132c 100644
--- a/main/openssl/crypto/ec/ecp_mont.c
+++ b/main/openssl/crypto/ec/ecp_mont.c
@@ -63,12 +63,20 @@
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_mont_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_mont_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_mont_group_init,
ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -109,6 +115,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_mont_field_set_to_one };
return &ret;
+#endif
}
diff --git a/main/openssl/crypto/ec/ecp_nist.c b/main/openssl/crypto/ec/ecp_nist.c
index 2a5682ea..aad2d5f4 100644
--- a/main/openssl/crypto/ec/ecp_nist.c
+++ b/main/openssl/crypto/ec/ecp_nist.c
@@ -67,9 +67,17 @@
#include <openssl/obj_mac.h>
#include "ec_lcl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GFp_nist_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_nist_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)
diff --git a/main/openssl/crypto/ec/ecp_oct.c b/main/openssl/crypto/ec/ecp_oct.c
new file mode 100644
index 00000000..374a0ee7
--- /dev/null
+++ b/main/openssl/crypto/ec/ecp_oct.c
@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project.
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp1, *tmp2, *x, *y;
+ int ret = 0;
+
+ /* clear error queue*/
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0);
+
+ BN_CTX_start(ctx);
+ tmp1 = BN_CTX_get(ctx);
+ tmp2 = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ /* Recover y. We have a Weierstrass equation
+ * y^2 = x^3 + a*x + b,
+ * so y is one of the square roots of x^3 + a*x + b.
+ */
+
+ /* tmp1 := x^3 */
+ if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+ if (group->meth->field_decode == 0)
+ {
+ /* field_{sqr,mul} work on standard representation */
+ if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+ if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+ if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+ }
+
+ /* tmp1 := tmp1 + a*x */
+ if (group->a_is_minus3)
+ {
+ if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+ if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+ if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+ if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+ }
+ else
+ {
+ /* field_mul works on standard representation */
+ if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+ }
+
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+
+ /* tmp1 := tmp1 + b */
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+ }
+
+ if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if (y_bit != BN_is_odd(y))
+ {
+ if (BN_is_zero(y))
+ {
+ int kron;
+
+ kron = BN_kronecker(x, &group->field, ctx);
+ if (kron == -2) goto err;
+
+ if (kron == 1)
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+ else
+				/* BN_mod_sqrt() should have caught this error (not a square) */
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ goto err;
+ }
+ if (!BN_usub(y, &group->field, y)) goto err;
+ }
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = BN_num_bytes(&group->field);
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+ buf[0] = form + 1;
+ else
+ buf[0] = form;
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = BN_num_bytes(&group->field);
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
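Two notes on the routines above. ec_GFp_simple_set_compressed_coordinates()
recovers y from the curve equation: over GF(p), y^2 = x^3 + a*x + b, so y
is one of the two square roots of the right-hand side, and the encoding's
parity bit (y_bit) picks between y and p - y. The encodings themselves
follow X9.62: a lone 0x00 octet for the point at infinity, 0x02/0x03
followed by x (field_len bytes) for compressed form, 0x04 followed by x
and y for uncompressed, 0x06/0x07 for hybrid; hence the expected lengths
are 1, 1 + field_len and 1 + 2*field_len. A short sketch (not from the
patch) of the length-query idiom the code supports when buf is NULL:

    size_t need = EC_POINT_point2oct(group, point,
            POINT_CONVERSION_UNCOMPRESSED, NULL, 0, ctx);
    unsigned char *buf = OPENSSL_malloc(need);  /* 1 + 2*field_len */
    if (buf != NULL
        && EC_POINT_point2oct(group, point, POINT_CONVERSION_UNCOMPRESSED,
                buf, need, ctx) == need)
        {
        /* buf[0] == 0x04; X and Y follow, field_len bytes each */
        }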
diff --git a/main/openssl/crypto/ec/ecp_smpl.c b/main/openssl/crypto/ec/ecp_smpl.c
index 66a92e2a..7cbb321f 100644
--- a/main/openssl/crypto/ec/ecp_smpl.c
+++ b/main/openssl/crypto/ec/ecp_smpl.c
@@ -65,11 +65,19 @@
#include <openssl/err.h>
#include <openssl/symhacks.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P
return ret;
}
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp1, *tmp2, *x, *y;
- int ret = 0;
-
- /* clear error queue*/
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0);
-
- BN_CTX_start(ctx);
- tmp1 = BN_CTX_get(ctx);
- tmp2 = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- /* Recover y. We have a Weierstrass equation
- * y^2 = x^3 + a*x + b,
- * so y is one of the square roots of x^3 + a*x + b.
- */
-
- /* tmp1 := x^3 */
- if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
- if (group->meth->field_decode == 0)
- {
- /* field_{sqr,mul} work on standard representation */
- if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
- if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
- }
- else
- {
- if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
- if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
- }
-
- /* tmp1 := tmp1 + a*x */
- if (group->a_is_minus3)
- {
- if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
- if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
- if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
- if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
- }
- else
- {
- /* field_mul works on standard representation */
- if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
- }
-
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
-
- /* tmp1 := tmp1 + b */
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
- }
-
- if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
-
- if (y_bit != BN_is_odd(y))
- {
- if (BN_is_zero(y))
- {
- int kron;
-
- kron = BN_kronecker(x, &group->field, ctx);
- if (kron == -2) goto err;
-
- if (kron == 1)
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
- else
- /* BN_mod_sqrt() should have cought this error (not a square) */
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- goto err;
- }
- if (!BN_usub(y, &group->field, y)) goto err;
- }
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = BN_num_bytes(&group->field);
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
- buf[0] = form + 1;
- else
- buf[0] = form;
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = BN_num_bytes(&group->field);
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);
diff --git a/main/openssl/crypto/ec/ectest.c b/main/openssl/crypto/ec/ectest.c
index 7509cb9c..102eaa9b 100644
--- a/main/openssl/crypto/ec/ectest.c
+++ b/main/openssl/crypto/ec/ectest.c
@@ -94,6 +94,7 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
#include <openssl/objects.h>
#include <openssl/rand.h>
#include <openssl/bn.h>
+#include <openssl/opensslconf.h>
#if defined(_MSC_VER) && defined(_MIPS_) && (_MSC_VER/100==12)
/* suppress "too big too optimize" warning */
@@ -107,10 +108,6 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
EXIT(1); \
} while (0)
-void prime_field_tests(void);
-void char2_field_tests(void);
-void internal_curve_test(void);
-
#define TIMING_BASE_PT 0
#define TIMING_RAND_PT 1
#define TIMING_SIMUL 2
@@ -195,8 +192,51 @@ static void timings(EC_GROUP *group, int type, BN_CTX *ctx)
}
#endif
-void prime_field_tests()
- {
+/* test multiplication with group order, long and negative scalars */
+static void group_order_tests(EC_GROUP *group)
+ {
+ BIGNUM *n1, *n2, *order;
+ EC_POINT *P = EC_POINT_new(group);
+ EC_POINT *Q = EC_POINT_new(group);
+ BN_CTX *ctx = BN_CTX_new();
+
+ n1 = BN_new(); n2 = BN_new(); order = BN_new();
+ fprintf(stdout, "verify group order ...");
+ fflush(stdout);
+ if (!EC_GROUP_get_order(group, order, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, ".");
+ fflush(stdout);
+ if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, " ok\n");
+ fprintf(stdout, "long/negative scalar tests ... ");
+ if (!BN_one(n1)) ABORT;
+ /* n1 = 1 - order */
+ if (!BN_sub(n1, n1, order)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n1, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = 1 + order */
+ if (!BN_add(n2, order, BN_value_one())) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = (1 - order) * (1 + order) */
+ if (!BN_mul(n2, n1, n2, ctx)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ fprintf(stdout, "ok\n");
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ BN_free(n1);
+ BN_free(n2);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+static void prime_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -321,21 +361,21 @@ void prime_field_tests()
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "Generator as octect string, compressed form:\n ");
+ fprintf(stdout, "Generator as octet string, compressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_UNCOMPRESSED, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, uncompressed form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, uncompressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_HYBRID, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, hybrid form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, hybrid form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
if (!EC_POINT_get_Jprojective_coordinates_GFp(group, R, x, y, z, ctx)) ABORT;
@@ -381,17 +421,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 160) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_160 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_160, group)) ABORT;
@@ -425,17 +455,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 192) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_192 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_192, group)) ABORT;
@@ -469,17 +489,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 224) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_224 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_224, group)) ABORT;
@@ -514,17 +524,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 256) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_256 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_256, group)) ABORT;
@@ -563,18 +563,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 384) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_384 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_384, group)) ABORT;
@@ -619,18 +609,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 521) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_521 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_521, group)) ABORT;
@@ -659,6 +639,7 @@ void prime_field_tests()
points[2] = Q;
points[3] = Q;
+ if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
if (!BN_add(y, z, BN_value_one())) ABORT;
if (BN_is_odd(y)) ABORT;
if (!BN_rshift1(y, y)) ABORT;
@@ -792,22 +773,14 @@ void prime_field_tests()
fprintf(stdout, "verify degree ..."); \
if (EC_GROUP_get_degree(group) != _degree) ABORT; \
fprintf(stdout, " ok\n"); \
- fprintf(stdout, "verify group order ..."); \
- fflush(stdout); \
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, "."); \
- fflush(stdout); \
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, " ok\n"); \
+ group_order_tests(group); \
if (!(_variable = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT; \
- if (!EC_GROUP_copy(_variable, group)) ABORT;
+ if (!EC_GROUP_copy(_variable, group)) ABORT; \
-void char2_field_tests()
- {
+#ifndef OPENSSL_NO_EC2M
+
+static void char2_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -1239,8 +1212,9 @@ void char2_field_tests()
if (C2_B571) EC_GROUP_free(C2_B571);
}
+#endif
-void internal_curve_test(void)
+static void internal_curve_test(void)
{
EC_builtin_curve *curves = NULL;
size_t crv_len = 0, n = 0;
@@ -1287,13 +1261,189 @@ void internal_curve_test(void)
EC_GROUP_free(group);
}
if (ok)
- fprintf(stdout, " ok\n");
+ fprintf(stdout, " ok\n\n");
else
- fprintf(stdout, " failed\n");
+ {
+ fprintf(stdout, " failed\n\n");
+ ABORT;
+ }
OPENSSL_free(curves);
return;
}
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* nistp_test_params contains magic numbers for testing our optimized
+ * implementations of several NIST curves with characteristic > 3. */
+struct nistp_test_params
+ {
+ const EC_METHOD* (*meth) ();
+ int degree;
+ /* Qx, Qy and D are taken from
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+	 * Otherwise, values are standard curve parameters from FIPS 186-3 */
+ const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
+ };
+
+static const struct nistp_test_params nistp_tests_params[] =
+ {
+ {
+ /* P-224 */
+ EC_GFp_nistp224_method,
+ 224,
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", /* p */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", /* a */
+ "B4050A850C04B3ABF54132565044B0B7D7BFD8BA270B39432355FFB4", /* b */
+ "E84FB0B8E7000CB657D7973CF6B42ED78B301674276DF744AF130B3E", /* Qx */
+ "4376675C6FC5612C21A0FF2D2A89D2987DF7A2BC52183B5982298555", /* Qy */
+ "B70E0CBD6BB4BF7F321390B94A03C1D356C21122343280D6115C1D21", /* Gx */
+ "BD376388B5F723FB4C22DFE6CD4375A05A07476444D5819985007E34", /* Gy */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFF16A2E0B8F03E13DD29455C5C2A3D", /* order */
+ "3F0C488E987C80BE0FEE521F8D90BE6034EC69AE11CA72AA777481E8", /* d */
+ },
+ {
+ /* P-256 */
+ EC_GFp_nistp256_method,
+ 256,
+ "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", /* p */
+ "ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", /* a */
+ "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", /* b */
+ "b7e08afdfe94bad3f1dc8c734798ba1c62b3a0ad1e9ea2a38201cd0889bc7a19", /* Qx */
+ "3603f747959dbf7a4bb226e41928729063adc7ae43529e61b563bbc606cc5e09", /* Qy */
+ "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", /* Gx */
+ "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", /* Gy */
+ "ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", /* order */
+ "c477f9f65c22cce20657faa5b2d1d8122336f851a508a1ed04e479c34985bf96", /* d */
+ },
+ {
+ /* P-521 */
+ EC_GFp_nistp521_method,
+ 521,
+ "1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", /* p */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc", /* a */
+ "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00", /* b */
+ "0098e91eef9a68452822309c52fab453f5f117c1da8ed796b255e9ab8f6410cca16e59df403a6bdc6ca467a37056b1e54b3005d8ac030decfeb68df18b171885d5c4", /* Qx */
+ "0164350c321aecfc1cca1ba4364c9b15656150b4b78d6a48d7d28e7f31985ef17be8554376b72900712c4b83ad668327231526e313f5f092999a4632fd50d946bc2e", /* Qy */
+ "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", /* Gx */
+ "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650", /* Gy */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", /* order */
+ "0100085f47b8e1b8b11b7eb33028c0b2888e304bfc98501955b45bba1478dc184eeedf09b86a5f7c21994406072787205e69a63709fe35aa93ba333514b24f961722", /* d */
+ },
+ };
+
+void nistp_single_test(const struct nistp_test_params *test)
+ {
+ BN_CTX *ctx;
+ BIGNUM *p, *a, *b, *x, *y, *n, *m, *order;
+ EC_GROUP *NISTP;
+ EC_POINT *G, *P, *Q, *Q_CHECK;
+
+ fprintf(stdout, "\nNIST curve P-%d (optimised implementation):\n", test->degree);
+ ctx = BN_CTX_new();
+ p = BN_new();
+ a = BN_new();
+ b = BN_new();
+ x = BN_new(); y = BN_new();
+ m = BN_new(); n = BN_new(); order = BN_new();
+
+ NISTP = EC_GROUP_new(test->meth());
+ if(!NISTP) ABORT;
+ if (!BN_hex2bn(&p, test->p)) ABORT;
+ if (1 != BN_is_prime_ex(p, BN_prime_checks, ctx, NULL)) ABORT;
+ if (!BN_hex2bn(&a, test->a)) ABORT;
+ if (!BN_hex2bn(&b, test->b)) ABORT;
+ if (!EC_GROUP_set_curve_GFp(NISTP, p, a, b, ctx)) ABORT;
+ G = EC_POINT_new(NISTP);
+ P = EC_POINT_new(NISTP);
+ Q = EC_POINT_new(NISTP);
+ Q_CHECK = EC_POINT_new(NISTP);
+ if(!BN_hex2bn(&x, test->Qx)) ABORT;
+ if(!BN_hex2bn(&y, test->Qy)) ABORT;
+ if(!EC_POINT_set_affine_coordinates_GFp(NISTP, Q_CHECK, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&x, test->Gx)) ABORT;
+ if (!BN_hex2bn(&y, test->Gy)) ABORT;
+ if (!EC_POINT_set_affine_coordinates_GFp(NISTP, G, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&order, test->order)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+
+ fprintf(stdout, "verify degree ... ");
+ if (EC_GROUP_get_degree(NISTP) != test->degree) ABORT;
+ fprintf(stdout, "ok\n");
+
+ fprintf(stdout, "NIST test vectors ... ");
+ if (!BN_hex2bn(&n, test->d)) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* set generator to P = 2*G, where G is the standard generator */
+ if (!EC_POINT_dbl(NISTP, P, G, ctx)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, P, order, BN_value_one())) ABORT;
+ /* set the scalar to m=n/2, where n is the NIST test scalar */
+ if (!BN_rshift(m, n, 1)) ABORT;
+
+ /* test the non-standard generator */
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* now repeat all tests with precomputation */
+ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT;
+
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* reset generator */
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ fprintf(stdout, "ok\n");
+ group_order_tests(NISTP);
+#if 0
+ timings(NISTP, TIMING_BASE_PT, ctx);
+ timings(NISTP, TIMING_RAND_PT, ctx);
+#endif
+ EC_GROUP_free(NISTP);
+ EC_POINT_free(G);
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ EC_POINT_free(Q_CHECK);
+ BN_free(n);
+ BN_free(m);
+ BN_free(p);
+ BN_free(a);
+ BN_free(b);
+ BN_free(x);
+ BN_free(y);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+void nistp_tests()
+ {
+ unsigned i;
+
+ for (i = 0; i < sizeof(nistp_tests_params) / sizeof(struct nistp_test_params); i++)
+ {
+ nistp_single_test(&nistp_tests_params[i]);
+ }
+ }
+#endif
+
static const char rnd_seed[] = "string to make the random number generator think it has entropy";
int main(int argc, char *argv[])
@@ -1317,7 +1467,12 @@ int main(int argc, char *argv[])
prime_field_tests();
puts("");
+#ifndef OPENSSL_NO_EC2M
char2_field_tests();
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ nistp_tests();
+#endif
/* test the internal curves */
internal_curve_test();
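The consolidated group_order_tests() relies on a small identity worth
spelling out: with n the group order, n*P = O (the point at infinity), so
(1 - n)*P = P - n*P = P and (1 + n)*P = P + n*P = P; the product
(1 - n)*(1 + n) = 1 - n^2 is likewise congruent to 1 modulo n, so it too
must map P back to itself. Each EC_POINT_cmp(group, Q, P, ctx) == 0 check
in the helper verifies one of these cases, both before and after
EC_GROUP_precompute_mult().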
diff --git a/main/openssl/crypto/ecdh/ecdh.h b/main/openssl/crypto/ecdh/ecdh.h
index b4b58ee6..8887102c 100644
--- a/main/openssl/crypto/ecdh/ecdh.h
+++ b/main/openssl/crypto/ecdh/ecdh.h
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void);
/* Error codes for the ECDH functions. */
/* Function codes. */
+#define ECDH_F_ECDH_CHECK 102
#define ECDH_F_ECDH_COMPUTE_KEY 100
#define ECDH_F_ECDH_DATA_NEW_METHOD 101
/* Reason codes. */
#define ECDH_R_KDF_FAILED 102
+#define ECDH_R_NON_FIPS_METHOD 103
#define ECDH_R_NO_PRIVATE_VALUE 100
#define ECDH_R_POINT_ARITHMETIC_FAILURE 101
diff --git a/main/openssl/crypto/ecdh/ecdhtest.c b/main/openssl/crypto/ecdh/ecdhtest.c
index 212a87ef..823d7baa 100644
--- a/main/openssl/crypto/ecdh/ecdhtest.c
+++ b/main/openssl/crypto/ecdh/ecdhtest.c
@@ -158,11 +158,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 1=");
BN_print(out,a->priv_key);
@@ -183,11 +185,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 2=");
@@ -324,6 +328,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err;
+#ifndef OPENSSL_NO_EC2M
/* NIST BINARY CURVES TESTS */
if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err;
@@ -335,6 +340,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err;
+#endif
ret = 0;
diff --git a/main/openssl/crypto/ecdh/ech_err.c b/main/openssl/crypto/ecdh/ech_err.c
index 6f4b0c99..3bd24739 100644
--- a/main/openssl/crypto/ecdh/ech_err.c
+++ b/main/openssl/crypto/ecdh/ech_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdh/ech_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDH_str_functs[]=
{
+{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"},
{ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"},
{ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"},
{0,NULL}
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]=
static ERR_STRING_DATA ECDH_str_reasons[]=
{
{ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
{0,NULL}
diff --git a/main/openssl/crypto/ecdh/ech_key.c b/main/openssl/crypto/ecdh/ech_key.c
index f44da929..2988899e 100644
--- a/main/openssl/crypto/ecdh/ech_key.c
+++ b/main/openssl/crypto/ecdh/ech_key.c
@@ -68,9 +68,6 @@
*/
#include "ech_locl.h"
-#ifndef OPENSSL_NO_ENGINE
-#include <openssl/engine.h>
-#endif
int ECDH_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
EC_KEY *eckey,
diff --git a/main/openssl/crypto/ecdh/ech_lib.c b/main/openssl/crypto/ecdh/ech_lib.c
index 4d8ea03d..0644431b 100644
--- a/main/openssl/crypto/ecdh/ech_lib.c
+++ b/main/openssl/crypto/ecdh/ech_lib.c
@@ -73,6 +73,9 @@
#include <openssl/engine.h>
#endif
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth)
const ECDH_METHOD *ECDH_get_default_method(void)
{
if(!default_ECDH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdh_openssl();
+ else
+ return ECDH_OpenSSL();
+#else
default_ECDH_method = ECDH_OpenSSL();
+#endif
+ }
return default_ECDH_method;
}
@@ -210,11 +222,26 @@ ECDH_DATA *ecdh_check(EC_KEY *key)
ecdh_data = (ECDH_DATA *)ecdh_data_new();
if (ecdh_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
- ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
+ ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdh_data_free(ecdh_data);
+ ecdh_data = (ECDH_DATA *)data;
+ }
}
else
ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdh_data;
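The ecdh_check() hunk closes a race: two threads could each find no
key-method data, each allocate an ECDH_DATA, and each try to install it.
EC_KEY_insert_key_method_data() now reports the previously installed
pointer, so the loser frees its own copy and adopts the winner's. A
hedged sketch of the idiom with illustrative names (make_data,
publish_once and free_data are hypothetical, not OpenSSL API):

    void *mine = make_data();               /* our candidate object */
    void *prior = publish_once(obj, mine);  /* returns earlier value, if any */
    if (prior != NULL)
        {
        free_data(mine);                    /* we lost the race */
        mine = prior;                       /* use the winner's object */
        }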
diff --git a/main/openssl/crypto/ecdh/ech_locl.h b/main/openssl/crypto/ecdh/ech_locl.h
index f658526a..f6cad6a8 100644
--- a/main/openssl/crypto/ecdh/ech_locl.h
+++ b/main/openssl/crypto/ecdh/ech_locl.h
@@ -75,6 +75,14 @@ struct ecdh_method
char *app_data;
};
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdh_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/main/openssl/crypto/ecdh/ech_ossl.c b/main/openssl/crypto/ecdh/ech_ossl.c
index 2a40ff12..4a30628f 100644
--- a/main/openssl/crypto/ecdh/ech_ossl.c
+++ b/main/openssl/crypto/ecdh/ech_ossl.c
@@ -157,6 +157,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx))
@@ -165,6 +166,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#endif
buflen = (EC_GROUP_get_degree(group) + 7)/8;
len = BN_num_bytes(x);
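
/* Worked note, not part of the patch: the buflen computation above rounds
 * the field size in bits up to whole bytes with (bits + 7) / 8, e.g. a
 * 163-bit binary curve needs 21 bytes and a 256-bit prime curve 32. */
#include <stddef.h>
#include <assert.h>

static size_t bits_to_bytes(size_t bits)
	{
	return (bits + 7) / 8;
	}

int main(void)
	{
	assert(bits_to_bytes(163) == 21);
	assert(bits_to_bytes(256) == 32);
	return 0;
	}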
diff --git a/main/openssl/crypto/ecdsa/ecdsa.h b/main/openssl/crypto/ecdsa/ecdsa.h
index e61c5398..dc6a36b1 100644
--- a/main/openssl/crypto/ecdsa/ecdsa.h
+++ b/main/openssl/crypto/ecdsa/ecdsa.h
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void);
/* Error codes for the ECDSA functions. */
/* Function codes. */
+#define ECDSA_F_ECDSA_CHECK 104
#define ECDSA_F_ECDSA_DATA_NEW_METHOD 100
#define ECDSA_F_ECDSA_DO_SIGN 101
#define ECDSA_F_ECDSA_DO_VERIFY 102
@@ -249,6 +250,8 @@ void ERR_load_ECDSA_strings(void);
#define ECDSA_R_ERR_EC_LIB 102
#define ECDSA_R_MISSING_PARAMETERS 103
#define ECDSA_R_NEED_NEW_SETUP_VALUES 106
+#define ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED 108
+#define ECDSA_R_NON_FIPS_METHOD 107
#define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104
#define ECDSA_R_SIGNATURE_MALLOC_FAILED 105
diff --git a/main/openssl/crypto/ecdsa/ecdsatest.c b/main/openssl/crypto/ecdsa/ecdsatest.c
index 26a4a9ee..537bb303 100644
--- a/main/openssl/crypto/ecdsa/ecdsatest.c
+++ b/main/openssl/crypto/ecdsa/ecdsatest.c
@@ -262,6 +262,7 @@ int x9_62_tests(BIO *out)
"3238135532097973577080787768312505059318910517550078427819"
"78505179448783"))
goto x962_err;
+#ifndef OPENSSL_NO_EC2M
if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1,
"87194383164871543355722284926904419997237591535066528048",
"308992691965804947361541664549085895292153777025772063598"))
@@ -272,7 +273,7 @@ int x9_62_tests(BIO *out)
"1970303740007316867383349976549972270528498040721988191026"
"49413465737174"))
goto x962_err;
-
+#endif
ret = 1;
x962_err:
if (!restore_rand())
@@ -286,9 +287,13 @@ int test_builtin(BIO *out)
size_t crv_len = 0, n = 0;
EC_KEY *eckey = NULL, *wrong_eckey = NULL;
EC_GROUP *group;
+ ECDSA_SIG *ecdsa_sig = NULL;
unsigned char digest[20], wrong_digest[20];
- unsigned char *signature = NULL;
- unsigned int sig_len;
+ unsigned char *signature = NULL;
+ const unsigned char *sig_ptr;
+ unsigned char *sig_ptr2;
+ unsigned char *raw_buf = NULL;
+ unsigned int sig_len, degree, r_len, s_len, bn_len, buf_len;
int nid, ret = 0;
/* fill digest values with some random data */
@@ -338,7 +343,8 @@ int test_builtin(BIO *out)
if (EC_KEY_set_group(eckey, group) == 0)
goto builtin_err;
EC_GROUP_free(group);
- if (EC_GROUP_get_degree(EC_KEY_get0_group(eckey)) < 160)
+ degree = EC_GROUP_get_degree(EC_KEY_get0_group(eckey));
+ if (degree < 160)
/* drop the curve */
{
EC_KEY_free(eckey);
@@ -414,26 +420,89 @@ int test_builtin(BIO *out)
}
BIO_printf(out, ".");
(void)BIO_flush(out);
- /* modify a single byte of the signature */
- offset = signature[10] % sig_len;
- dirt = signature[11];
- signature[offset] ^= dirt ? dirt : 1;
+ /* wrong length */
+ if (ECDSA_verify(0, digest, 20, signature, sig_len - 1,
+ eckey) == 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ BIO_printf(out, ".");
+ (void)BIO_flush(out);
+
+ /* Modify a single byte of the signature: to ensure we don't
+ * garble the ASN1 structure, we read the raw signature and
+ * modify a byte in one of the bignums directly. */
+ sig_ptr = signature;
+ if ((ecdsa_sig = d2i_ECDSA_SIG(NULL, &sig_ptr, sig_len)) == NULL)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+
+ /* Store the two BIGNUMs in raw_buf. */
+ r_len = BN_num_bytes(ecdsa_sig->r);
+ s_len = BN_num_bytes(ecdsa_sig->s);
+ bn_len = (degree + 7) / 8;
+ if ((r_len > bn_len) || (s_len > bn_len))
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ buf_len = 2 * bn_len;
+ if ((raw_buf = OPENSSL_malloc(buf_len)) == NULL)
+ goto builtin_err;
+ /* Pad the bignums with leading zeroes. */
+ memset(raw_buf, 0, buf_len);
+ BN_bn2bin(ecdsa_sig->r, raw_buf + bn_len - r_len);
+ BN_bn2bin(ecdsa_sig->s, raw_buf + buf_len - s_len);
+
+ /* Modify a single byte in the buffer. */
+ offset = raw_buf[10] % buf_len;
+ dirt = raw_buf[11] ? raw_buf[11] : 1;
+ raw_buf[offset] ^= dirt;
+ /* Now read the BIGNUMs back in from raw_buf. */
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) == 1)
{
BIO_printf(out, " failed\n");
goto builtin_err;
}
+ /* Sanity check: undo the modification and verify signature. */
+ raw_buf[offset] ^= dirt;
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
+ if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) != 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
BIO_printf(out, ".");
(void)BIO_flush(out);
BIO_printf(out, " ok\n");
/* cleanup */
+ /* clean bogus errors */
+ ERR_clear_error();
OPENSSL_free(signature);
signature = NULL;
EC_KEY_free(eckey);
eckey = NULL;
EC_KEY_free(wrong_eckey);
wrong_eckey = NULL;
+ ECDSA_SIG_free(ecdsa_sig);
+ ecdsa_sig = NULL;
+ OPENSSL_free(raw_buf);
+ raw_buf = NULL;
}
ret = 1;
@@ -442,8 +511,12 @@ builtin_err:
EC_KEY_free(eckey);
if (wrong_eckey)
EC_KEY_free(wrong_eckey);
+ if (ecdsa_sig)
+ ECDSA_SIG_free(ecdsa_sig);
if (signature)
OPENSSL_free(signature);
+ if (raw_buf)
+ OPENSSL_free(raw_buf);
if (curves)
OPENSSL_free(curves);
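
/* Condensed sketch, not part of the patch, of the tamper round-trip the
 * test above performs: decode the DER signature, left-pad r and s into a
 * fixed-width buffer, flip one value byte, re-encode. This corrupts the
 * signature value itself rather than the ASN.1 framing. tamper_sig() is a
 * hypothetical helper; `degree` is the curve's field size in bits and
 * `der` must be writable and large enough for the re-encoding. */
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/bn.h>
#include <openssl/ecdsa.h>

int tamper_sig(unsigned char *der, int *der_len, int degree)
	{
	const unsigned char *p = der;
	unsigned char *q, *buf = NULL;
	int bn_len = (degree + 7) / 8, buf_len, ok = 0;
	ECDSA_SIG *sig = d2i_ECDSA_SIG(NULL, &p, *der_len);

	if (sig == NULL)
		return 0;
	buf_len = 2 * bn_len;
	if (BN_num_bytes(sig->r) > bn_len || BN_num_bytes(sig->s) > bn_len)
		goto err;
	if ((buf = OPENSSL_malloc(buf_len)) == NULL)
		goto err;
	memset(buf, 0, buf_len);
	/* left-pad with zeroes so each bignum spans exactly bn_len bytes */
	BN_bn2bin(sig->r, buf + bn_len - BN_num_bytes(sig->r));
	BN_bn2bin(sig->s, buf + buf_len - BN_num_bytes(sig->s));
	buf[buf[0] % buf_len] ^= buf[1] ? buf[1] : 1; /* flip one byte */
	if (BN_bin2bn(buf, bn_len, sig->r) == NULL ||
	    BN_bin2bn(buf + bn_len, bn_len, sig->s) == NULL)
		goto err;
	q = der;
	*der_len = i2d_ECDSA_SIG(sig, &q); /* re-encode over the input */
	ok = 1;
err:
	if (buf)
		OPENSSL_free(buf);
	ECDSA_SIG_free(sig);
	return ok;
	}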
diff --git a/main/openssl/crypto/ecdsa/ecs_err.c b/main/openssl/crypto/ecdsa/ecs_err.c
index 98e38d53..7406c6d8 100644
--- a/main/openssl/crypto/ecdsa/ecs_err.c
+++ b/main/openssl/crypto/ecdsa/ecs_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdsa/ecs_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDSA_str_functs[]=
{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"},
{ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"},
@@ -84,6 +85,8 @@ static ERR_STRING_DATA ECDSA_str_reasons[]=
{ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"},
{ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED),"nonce cannot be precomputed"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
{ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
{0,NULL}
diff --git a/main/openssl/crypto/ecdsa/ecs_lib.c b/main/openssl/crypto/ecdsa/ecs_lib.c
index 2ebae3aa..814a6bf4 100644
--- a/main/openssl/crypto/ecdsa/ecs_lib.c
+++ b/main/openssl/crypto/ecdsa/ecs_lib.c
@@ -60,6 +60,9 @@
#endif
#include <openssl/err.h>
#include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth)
const ECDSA_METHOD *ECDSA_get_default_method(void)
{
if(!default_ECDSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdsa_openssl();
+ else
+ return ECDSA_OpenSSL();
+#else
default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+ }
return default_ECDSA_method;
}
@@ -188,12 +200,26 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key)
ecdsa_data = (ECDSA_DATA *)ecdsa_data_new();
if (ecdsa_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
- ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
+ ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdsa_data_free(ecdsa_data);
+ ecdsa_data = (ECDSA_DATA *)data;
+ }
}
else
ecdsa_data = (ECDSA_DATA *)data;
-
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdsa_data;
}
diff --git a/main/openssl/crypto/ecdsa/ecs_locl.h b/main/openssl/crypto/ecdsa/ecs_locl.h
index 3a69a840..46f7ad91 100644
--- a/main/openssl/crypto/ecdsa/ecs_locl.h
+++ b/main/openssl/crypto/ecdsa/ecs_locl.h
@@ -70,8 +70,9 @@ struct ecdsa_method
const char *name;
ECDSA_SIG *(*ecdsa_do_sign)(const unsigned char *dgst, int dgst_len,
const BIGNUM *inv, const BIGNUM *rp, EC_KEY *eckey);
- int (*ecdsa_sign_setup)(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
- BIGNUM **r);
+ int (*ecdsa_sign_setup)(EC_KEY *eckey, BN_CTX *ctx,
+ BIGNUM **kinv, BIGNUM **r,
+ const unsigned char *dgst, int dlen);
int (*ecdsa_do_verify)(const unsigned char *dgst, int dgst_len,
const ECDSA_SIG *sig, EC_KEY *eckey);
#if 0
@@ -82,6 +83,14 @@ struct ecdsa_method
char *app_data;
};
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdsa_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
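
/* Sketch, not part of the patch: the shape of a setup callback under the
 * widened signature above. struct ecdsa_method is private to the library
 * (this header), so only built-in or FIPS methods implement it; dgst and
 * dlen may be NULL/0 when a caller precomputes via ECDSA_sign_setup(). */
#include <openssl/ec.h>
#include <openssl/bn.h>

static int example_sign_setup(EC_KEY *eckey, BN_CTX *ctx,
			      BIGNUM **kinvp, BIGNUM **rp,
			      const unsigned char *dgst, int dlen)
	{
	/* Derive the nonce k here, optionally mixing in dgst/dlen, then
	 * compute r = x([k]G) mod n and kinv = k^-1 mod n and hand both
	 * back through kinvp/rp; returning 0 signals failure. */
	(void)eckey; (void)ctx; (void)kinvp; (void)rp; (void)dgst; (void)dlen;
	return 0; /* placeholder: a real method fills kinvp and rp */
	}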
diff --git a/main/openssl/crypto/ecdsa/ecs_ossl.c b/main/openssl/crypto/ecdsa/ecs_ossl.c
index 1bbf328d..325aca8c 100644
--- a/main/openssl/crypto/ecdsa/ecs_ossl.c
+++ b/main/openssl/crypto/ecdsa/ecs_ossl.c
@@ -60,11 +60,13 @@
#include <openssl/err.h>
#include <openssl/obj_mac.h>
#include <openssl/bn.h>
+#include <openssl/rand.h>
static ECDSA_SIG *ecdsa_do_sign(const unsigned char *dgst, int dlen,
const BIGNUM *, const BIGNUM *, EC_KEY *eckey);
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp);
+static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
const ECDSA_SIG *sig, EC_KEY *eckey);
@@ -86,8 +88,9 @@ const ECDSA_METHOD *ECDSA_OpenSSL(void)
return &openssl_ecdsa_meth;
}
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp)
+static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen)
{
BN_CTX *ctx = NULL;
BIGNUM *k = NULL, *r = NULL, *order = NULL, *X = NULL;
@@ -136,11 +139,28 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
{
/* get random k */
do
- if (!BN_rand_range(k, order))
+#ifndef OPENSSL_NO_SHA512
+ if (EC_KEY_get_nonce_from_hash(eckey))
{
- ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
- ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
- goto err;
+ if (!BN_generate_dsa_nonce(
+ k, order,
+ EC_KEY_get0_private_key(eckey),
+ dgst, dlen, ctx))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
+ ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
+ goto err;
+ }
+ }
+ else
+#endif
+ {
+ if (!BN_rand_range(k, order))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
+ ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
+ goto err;
+ }
}
while (BN_is_zero(k));
@@ -167,6 +187,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +197,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#endif
if (!BN_nnmod(r, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -280,7 +302,7 @@ static ECDSA_SIG *ecdsa_do_sign(const unsigned char *dgst, int dgst_len,
{
if (in_kinv == NULL || in_r == NULL)
{
- if (!ECDSA_sign_setup(eckey, ctx, &kinv, &ret->r))
+ if (!ecdsa->meth->ecdsa_sign_setup(eckey, ctx, &kinv, &ret->r, dgst, dgst_len))
{
ECDSAerr(ECDSA_F_ECDSA_DO_SIGN,ERR_R_ECDSA_LIB);
goto err;
@@ -454,6 +476,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +486,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
-
+#endif
if (!BN_nnmod(u1, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
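
/* Usage sketch, not part of the patch. ecdsa_sign_setup() above derives k
 * via BN_generate_dsa_nonce() from the private key and the message digest
 * when the key opts in, so a weak RNG alone no longer leaks the key.
 * Assumes the EC_KEY_set_nonce_from_hash() setter added to ec.h elsewhere
 * in this patch series; `sig` must hold ECDSA_size(key) bytes. */
#include <openssl/ec.h>
#include <openssl/ecdsa.h>
#include <openssl/obj_mac.h>

int sign_with_hashed_nonce(const unsigned char dgst[20],
			   unsigned char *sig, unsigned int *siglen)
	{
	int ok = 0;
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

	if (key == NULL || !EC_KEY_generate_key(key))
		goto err;
	EC_KEY_set_nonce_from_hash(key, 1); /* k = f(priv, dgst, rand) */
	ok = ECDSA_sign(0, dgst, 20, sig, siglen, key);
err:
	EC_KEY_free(key);
	return ok;
	}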
diff --git a/main/openssl/crypto/ecdsa/ecs_sign.c b/main/openssl/crypto/ecdsa/ecs_sign.c
index 353d5af5..ea79a24b 100644
--- a/main/openssl/crypto/ecdsa/ecs_sign.c
+++ b/main/openssl/crypto/ecdsa/ecs_sign.c
@@ -58,6 +58,7 @@
#include <openssl/engine.h>
#endif
#include <openssl/rand.h>
+#include <openssl/err.h>
ECDSA_SIG *ECDSA_do_sign(const unsigned char *dgst, int dlen, EC_KEY *eckey)
{
@@ -102,5 +103,12 @@ int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
ECDSA_DATA *ecdsa = ecdsa_check(eckey);
if (ecdsa == NULL)
return 0;
- return ecdsa->meth->ecdsa_sign_setup(eckey, ctx_in, kinvp, rp);
+ if (EC_KEY_get_nonce_from_hash(eckey))
+ {
+ /* You cannot precompute the ECDSA nonce if it is required to
+ * depend on the message. */
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED);
+ return 0;
+ }
+ return ecdsa->meth->ecdsa_sign_setup(eckey, ctx_in, kinvp, rp, NULL, 0);
}

diff --git a/main/openssl/crypto/engine/eng_all.c b/main/openssl/crypto/engine/eng_all.c
index 22c12045..6093376d 100644
--- a/main/openssl/crypto/engine/eng_all.c
+++ b/main/openssl/crypto/engine/eng_all.c
@@ -61,6 +61,8 @@
void ENGINE_load_builtin_engines(void)
{
+ /* Some ENGINEs need this */
+ OPENSSL_cpuid_setup();
#if 0
/* There's no longer any need for an "openssl" ENGINE unless, one day,
* it is the *only* way for standard builtin implementations to be
@@ -71,6 +73,12 @@ void ENGINE_load_builtin_engines(void)
#if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
ENGINE_load_cryptodev();
#endif
+#ifndef OPENSSL_NO_RSAX
+ ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+ ENGINE_load_rdrand();
+#endif
ENGINE_load_dynamic();
#ifndef OPENSSL_NO_STATIC_ENGINE
#ifndef OPENSSL_NO_HW
@@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void)
ENGINE_load_capi();
#endif
#endif
+ ENGINE_register_all_complete();
}
#if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)
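
/* Usage sketch, not part of the patch: with the hunks above, one call now
 * probes the CPU (OPENSSL_cpuid_setup), loads the builtin engines
 * including rsax/rdrand where available, and registers them as defaults
 * via ENGINE_register_all_complete(). */
#include <openssl/engine.h>

void init_engines(void)
	{
	ENGINE *e;

	ENGINE_load_builtin_engines(); /* also registers defaults now */
	e = ENGINE_by_id("rdrand");    /* present only on capable CPUs */
	if (e != NULL)
		{
		/* structural reference only; release it when done */
		ENGINE_free(e);
		}
	}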
diff --git a/main/openssl/crypto/engine/eng_cryptodev.c b/main/openssl/crypto/engine/eng_cryptodev.c
index 52f4ca39..5a715aca 100644
--- a/main/openssl/crypto/engine/eng_cryptodev.c
+++ b/main/openssl/crypto/engine/eng_cryptodev.c
@@ -79,8 +79,6 @@ struct dev_crypto_state {
unsigned char digest_res[HASH_MAX_LEN];
char *mac_data;
int mac_len;
-
- int copy;
#endif
};
@@ -200,6 +198,7 @@ get_dev_crypto(void)
if ((fd = open_dev_crypto()) == -1)
return (-1);
+#ifndef CRIOGET_NOT_NEEDED
if (ioctl(fd, CRIOGET, &retfd) == -1)
return (-1);
@@ -208,9 +207,19 @@ get_dev_crypto(void)
close(retfd);
return (-1);
}
+#else
+ retfd = fd;
+#endif
return (retfd);
}
+static void put_dev_crypto(int fd)
+{
+#ifndef CRIOGET_NOT_NEEDED
+ close(fd);
+#endif
+}
+
/* Caching version for asym operations */
static int
get_asym_dev_crypto(void)
@@ -252,7 +261,7 @@ get_cryptodev_ciphers(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = ciphers[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -291,7 +300,7 @@ get_cryptodev_digests(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = digests[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -436,7 +445,7 @@ cryptodev_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
sess->cipher = cipher;
if (ioctl(state->d_fd, CIOCGSESSION, sess) == -1) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (0);
}
@@ -473,7 +482,7 @@ cryptodev_cleanup(EVP_CIPHER_CTX *ctx)
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -686,7 +695,7 @@ static int cryptodev_digest_init(EVP_MD_CTX *ctx)
sess->mac = digest;
if (ioctl(state->d_fd, CIOCGSESSION, sess) < 0) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
printf("cryptodev_digest_init: Open session failed\n");
return (0);
@@ -758,14 +767,12 @@ static int cryptodev_digest_final(EVP_MD_CTX *ctx, unsigned char *md)
if (! (ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) ) {
/* if application doesn't support one buffer */
memset(&cryp, 0, sizeof(cryp));
-
cryp.ses = sess->ses;
cryp.flags = 0;
cryp.len = state->mac_len;
cryp.src = state->mac_data;
cryp.dst = NULL;
cryp.mac = (caddr_t)md;
-
if (ioctl(state->d_fd, CIOCCRYPT, &cryp) < 0) {
printf("cryptodev_digest_final: digest failed\n");
return (0);
@@ -786,6 +793,9 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
struct dev_crypto_state *state = ctx->md_data;
struct session_op *sess = &state->d_sess;
+ if (state == NULL)
+ return 0;
+
if (state->d_fd < 0) {
printf("cryptodev_digest_cleanup: illegal input\n");
return (0);
@@ -797,16 +807,13 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
state->mac_len = 0;
}
- if (state->copy)
- return 1;
-
if (ioctl(state->d_fd, CIOCFSESSION, &sess->ses) < 0) {
printf("cryptodev_digest_cleanup: failed to close session\n");
ret = 0;
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -816,15 +823,39 @@ static int cryptodev_digest_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
{
struct dev_crypto_state *fstate = from->md_data;
struct dev_crypto_state *dstate = to->md_data;
+ struct session_op *sess;
+ int digest;
- memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+ if (dstate == NULL || fstate == NULL)
+ return 1;
- if (fstate->mac_len != 0) {
- dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
- memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+
+ sess = &dstate->d_sess;
+
+ digest = digest_nid_to_cryptodev(to->digest->type);
+
+ sess->mackey = dstate->dummy_mac_key;
+ sess->mackeylen = digest_key_length(to->digest->type);
+ sess->mac = digest;
+
+ dstate->d_fd = get_dev_crypto();
+
+ if (ioctl(dstate->d_fd, CIOCGSESSION, sess) < 0) {
+ put_dev_crypto(dstate->d_fd);
+ dstate->d_fd = -1;
+ printf("cryptodev_digest_init: Open session failed\n");
+ return (0);
}
- dstate->copy = 1;
+ if (fstate->mac_len != 0) {
+ if (fstate->mac_data != NULL)
+ {
+ dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
+ memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ dstate->mac_len = fstate->mac_len;
+ }
+ }
return 1;
}
@@ -1347,11 +1378,11 @@ ENGINE_load_cryptodev(void)
* find out what asymmetric crypto algorithms we support
*/
if (ioctl(fd, CIOCASYMFEAT, &cryptodev_asymfeat) == -1) {
- close(fd);
+ put_dev_crypto(fd);
ENGINE_free(engine);
return;
}
- close(fd);
+ put_dev_crypto(fd);
if (!ENGINE_set_id(engine, "cryptodev") ||
!ENGINE_set_name(engine, "BSD cryptodev engine") ||
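
/* Sketch, not part of the patch: the put_dev_crypto() pairing above keeps
 * the "clone a per-session fd via CRIOGET" vs. "share one device fd"
 * policy in exactly two functions, so no call site hard-codes a close().
 * CRIOGET comes from the platform's cryptodev header; this mirrors the
 * patched logic rather than adding anything new. */
#include <unistd.h>
#include <sys/ioctl.h>
#include <crypto/cryptodev.h> /* platform-dependent location */

static int acquire_crypto_fd(int devfd)
	{
#ifndef CRIOGET_NOT_NEEDED
	int fd;

	if (ioctl(devfd, CRIOGET, &fd) == -1) /* clone a private fd */
		return -1;
	return fd;
#else
	return devfd; /* the shared device fd is used directly */
#endif
	}

static void release_crypto_fd(int fd)
	{
#ifndef CRIOGET_NOT_NEEDED
	close(fd); /* only clones may be closed */
#else
	(void)fd;  /* the shared fd stays open */
#endif
	}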
diff --git a/main/openssl/crypto/engine/eng_dyn.c b/main/openssl/crypto/engine/eng_dyn.c
index 807da7a5..8fb8634e 100644
--- a/main/openssl/crypto/engine/eng_dyn.c
+++ b/main/openssl/crypto/engine/eng_dyn.c
@@ -408,7 +408,7 @@ static int int_load(dynamic_data_ctx *ctx)
int num, loop;
/* Unless told not to, try a direct load */
if((ctx->dir_load != 2) && (DSO_load(ctx->dynamic_dso,
- ctx->DYNAMIC_LIBNAME, NULL, 0)) != NULL)
+ ctx->DYNAMIC_LIBNAME, NULL, 0) != NULL))
return 1;
/* If we're not allowed to use 'dirs' or we have none, fail */
if(!ctx->dir_load || (num = sk_OPENSSL_STRING_num(ctx->dirs)) < 1)
@@ -423,6 +423,9 @@ static int int_load(dynamic_data_ctx *ctx)
{
/* Found what we're looking for */
OPENSSL_free(merge);
+ /* Previous failed loop iterations, if any, will have resulted in
+ * errors. Clear them out before returning success. */
+ ERR_clear_error();
return 1;
}
OPENSSL_free(merge);
diff --git a/main/openssl/crypto/engine/eng_fat.c b/main/openssl/crypto/engine/eng_fat.c
index db66e623..789b8d57 100644
--- a/main/openssl/crypto/engine/eng_fat.c
+++ b/main/openssl/crypto/engine/eng_fat.c
@@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void)
ENGINE *e;
for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
- ENGINE_register_complete(e);
+ if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+ ENGINE_register_complete(e);
return 1;
}
diff --git a/main/openssl/crypto/engine/engine.h b/main/openssl/crypto/engine/engine.h
index 943aeae2..f8be4977 100644
--- a/main/openssl/crypto/engine/engine.h
+++ b/main/openssl/crypto/engine/engine.h
@@ -141,6 +141,13 @@ extern "C" {
* the existing ENGINE's structural reference count. */
#define ENGINE_FLAGS_BY_ID_COPY (int)0x0004
+/* This flag is for an ENGINE that does not want its methods registered as
+ * part of ENGINE_register_all_complete() for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL (int)0x0008
+
/* ENGINEs can support their own command types, and these flags are used in
* ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
* command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@ void ENGINE_load_gost(void);
#endif
#endif
void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
void ENGINE_load_builtin_engines(void);
/* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
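
/* Usage sketch, not part of the patch: an ENGINE whose methods should
 * never become process-wide defaults opts out of the register-all pass
 * (see the eng_fat.c hunk above) with the new flag. */
#include <openssl/engine.h>

ENGINE *make_private_engine(void)
	{
	ENGINE *e = ENGINE_new();

	if (e == NULL)
		return NULL;
	if (!ENGINE_set_id(e, "myeng") ||
	    !ENGINE_set_name(e, "example engine") ||
	    !ENGINE_set_flags(e, ENGINE_FLAGS_NO_REGISTER_ALL))
		{
		ENGINE_free(e);
		return NULL;
		}
	return e; /* reachable via explicit ENGINE_by_id()/ENGINE_init() only */
	}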
diff --git a/main/openssl/crypto/engine/tb_asnmth.c b/main/openssl/crypto/engine/tb_asnmth.c
new file mode 100644
index 00000000..75090339
--- /dev/null
+++ b/main/openssl/crypto/engine/tb_asnmth.c
@@ -0,0 +1,246 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include "asn1_locl.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_asn1_meth_engine(), the
+ * function that is used by EVP to hook in pkey_asn1_meth code and cache
+ * defaults (etc), will display brief debugging summaries to stderr with the
+ * 'nid'. */
+/* #define ENGINE_PKEY_ASN1_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_asn1_meth_table = NULL;
+
+void ENGINE_unregister_pkey_asn1_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_asn1_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_asn1_meths(void)
+ {
+ engine_table_cleanup(&pkey_asn1_meth_table);
+ }
+
+int ENGINE_register_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_asn1_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_asn1_meths(e);
+ }
+
+int ENGINE_set_default_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (i.e. try to get a functional reference from the tabled structural
+ * references) for a given pkey_asn1_meth 'nid' */
+ENGINE *ENGINE_get_pkey_asn1_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_asn1_meth_table, nid);
+ }
+
+/* Obtains a pkey_asn1_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_ASN1_METHOD *ret;
+ ENGINE_PKEY_ASN1_METHS_PTR fn = ENGINE_get_pkey_asn1_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_ASN1_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_asn1_meth callback from an ENGINE structure */
+ENGINE_PKEY_ASN1_METHS_PTR ENGINE_get_pkey_asn1_meths(const ENGINE *e)
+ {
+ return e->pkey_asn1_meths;
+ }
+
+/* Sets the pkey_asn1_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_asn1_meths(ENGINE *e, ENGINE_PKEY_ASN1_METHS_PTR f)
+ {
+ e->pkey_asn1_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_ASN1_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_asn1_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_ASN1_METHOD *pkm;
+ if (e->pkey_asn1_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_asn1_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_asn1_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_asn1_free(pkm);
+ }
+ }
+ }
+ }
+
+/* Find a method based on a string. This does a linear search through
+ * all implemented algorithms. This is OK in practice because only
+ * a small number of algorithms are likely to be implemented in an engine
+ * and it is not used for speed-critical operations.
+ */
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth_str(ENGINE *e,
+ const char *str, int len)
+ {
+ int i, nidcount;
+ const int *nids;
+ EVP_PKEY_ASN1_METHOD *ameth;
+ if (!e->pkey_asn1_meths)
+ return NULL;
+ if (len == -1)
+ len = strlen(str);
+ nidcount = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ for (i = 0; i < nidcount; i++)
+ {
+ e->pkey_asn1_meths(e, &ameth, NULL, nids[i]);
+ if (((int)strlen(ameth->pem_str) == len) &&
+ !strncasecmp(ameth->pem_str, str, len))
+ return ameth;
+ }
+ return NULL;
+ }
+
+typedef struct
+ {
+ ENGINE *e;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ const char *str;
+ int len;
+ } ENGINE_FIND_STR;
+
+static void look_str_cb(int nid, STACK_OF(ENGINE) *sk, ENGINE *def, void *arg)
+ {
+ ENGINE_FIND_STR *lk = arg;
+ int i;
+ if (lk->ameth)
+ return;
+ for (i = 0; i < sk_ENGINE_num(sk); i++)
+ {
+ ENGINE *e = sk_ENGINE_value(sk, i);
+ EVP_PKEY_ASN1_METHOD *ameth;
+ e->pkey_asn1_meths(e, &ameth, NULL, nid);
+ if (((int)strlen(ameth->pem_str) == lk->len) &&
+ !strncasecmp(ameth->pem_str, lk->str, lk->len))
+ {
+ lk->e = e;
+ lk->ameth = ameth;
+ return;
+ }
+ }
+ }
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_pkey_asn1_find_str(ENGINE **pe,
+ const char *str, int len)
+ {
+ ENGINE_FIND_STR fstr;
+ fstr.e = NULL;
+ fstr.ameth = NULL;
+ fstr.str = str;
+ fstr.len = len;
+ CRYPTO_w_lock(CRYPTO_LOCK_ENGINE);
+ engine_table_doall(pkey_asn1_meth_table, look_str_cb, &fstr);
+ /* If found obtain a structural reference to engine */
+ if (fstr.e)
+ {
+ fstr.e->struct_ref++;
+ engine_ref_debug(fstr.e, 0, 1)
+ }
+ *pe = fstr.e;
+ CRYPTO_w_unlock(CRYPTO_LOCK_ENGINE);
+ return fstr.ameth;
+ }
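
/* Usage sketch, not part of the patch: resolve an engine-supplied ASN.1
 * method by its PEM string via the lookup implemented above. On success
 * *pe holds a structural reference to the owning ENGINE; release it with
 * ENGINE_free() when done. find_engine_ameth() is a hypothetical helper. */
#include <string.h>
#include <openssl/engine.h>
#include <openssl/evp.h>

const EVP_PKEY_ASN1_METHOD *find_engine_ameth(const char *pem_str,
					      ENGINE **pe)
	{
	return ENGINE_pkey_asn1_find_str(pe, pem_str,
					 (int)strlen(pem_str));
	}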
diff --git a/main/openssl/crypto/engine/tb_pkmeth.c b/main/openssl/crypto/engine/tb_pkmeth.c
new file mode 100644
index 00000000..1cdb967f
--- /dev/null
+++ b/main/openssl/crypto/engine/tb_pkmeth.c
@@ -0,0 +1,167 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_meth_engine(), the function
+ * that is used by EVP to hook in pkey_meth code and cache defaults (etc), will
+ * display brief debugging summaries to stderr with the 'nid'. */
+/* #define ENGINE_PKEY_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_meth_table = NULL;
+
+void ENGINE_unregister_pkey_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_meths(void)
+ {
+ engine_table_cleanup(&pkey_meth_table);
+ }
+
+int ENGINE_register_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_meths(e);
+ }
+
+int ENGINE_set_default_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (i.e. try to get a functional reference from the tabled structural
+ * references) for a given pkey_meth 'nid' */
+ENGINE *ENGINE_get_pkey_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_meth_table, nid);
+ }
+
+/* Obtains a pkey_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_METHOD *ENGINE_get_pkey_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_METHOD *ret;
+ ENGINE_PKEY_METHS_PTR fn = ENGINE_get_pkey_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_meth callback from an ENGINE structure */
+ENGINE_PKEY_METHS_PTR ENGINE_get_pkey_meths(const ENGINE *e)
+ {
+ return e->pkey_meths;
+ }
+
+/* Sets the pkey_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_meths(ENGINE *e, ENGINE_PKEY_METHS_PTR f)
+ {
+ e->pkey_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_METHOD *pkm;
+ if (e->pkey_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_meth_free(pkm);
+ }
+ }
+ }
+ }
diff --git a/main/openssl/crypto/err/err.c b/main/openssl/crypto/err/err.c
index 69713a6e..fcdb2440 100644
--- a/main/openssl/crypto/err/err.c
+++ b/main/openssl/crypto/err/err.c
@@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags)
void ERR_add_error_data(int num, ...)
{
va_list args;
+ va_start(args, num);
+ ERR_add_error_vdata(num, args);
+ va_end(args);
+ }
+
+void ERR_add_error_vdata(int num, va_list args)
+ {
int i,n,s;
char *str,*p,*a;
@@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...)
if (str == NULL) return;
str[0]='\0';
- va_start(args, num);
n=0;
for (i=0; i<num; i++)
{
@@ -1090,7 +1096,7 @@ void ERR_add_error_data(int num, ...)
if (p == NULL)
{
OPENSSL_free(str);
- goto err;
+ return;
}
else
str=p;
@@ -1099,9 +1105,6 @@ void ERR_add_error_data(int num, ...)
}
}
ERR_set_error_data(str,ERR_TXT_MALLOCED|ERR_TXT_STRING);
-
-err:
- va_end(args);
}
int ERR_set_mark(void)
diff --git a/main/openssl/crypto/err/err.h b/main/openssl/crypto/err/err.h
index b9f8c16d..974cc9cc 100644
--- a/main/openssl/crypto/err/err.h
+++ b/main/openssl/crypto/err/err.h
@@ -344,8 +344,9 @@ void ERR_print_errors_fp(FILE *fp);
#endif
#ifndef OPENSSL_NO_BIO
void ERR_print_errors(BIO *bp);
-void ERR_add_error_data(int num, ...);
#endif
+void ERR_add_error_data(int num, ...);
+void ERR_add_error_vdata(int num, va_list args);
void ERR_load_strings(int lib,ERR_STRING_DATA str[]);
void ERR_unload_strings(int lib,ERR_STRING_DATA str[]);
void ERR_load_ERR_strings(void);
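
/* Sketch, not part of the patch: ERR_add_error_data() is now a thin
 * va_start/va_end shim over the new ERR_add_error_vdata(), so any custom
 * variadic wrapper can forward its argument list instead of duplicating
 * the formatting logic. my_add_error_data() is a hypothetical wrapper. */
#include <stdarg.h>
#include <openssl/err.h>

void my_add_error_data(int num, ...)
	{
	va_list args;

	va_start(args, num);
	ERR_add_error_vdata(num, args);
	va_end(args);
	}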
diff --git a/main/openssl/crypto/err/err_all.c b/main/openssl/crypto/err/err_all.c
index fc049e8e..8eb547d9 100644
--- a/main/openssl/crypto/err/err_all.c
+++ b/main/openssl/crypto/err/err_all.c
@@ -64,7 +64,9 @@
#endif
#include <openssl/buffer.h>
#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
#include <openssl/comp.h>
+#endif
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
@@ -95,6 +97,9 @@
#include <openssl/ui.h>
#include <openssl/ocsp.h>
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include <openssl/ts.h>
#ifndef OPENSSL_NO_CMS
#include <openssl/cms.h>
@@ -102,7 +107,6 @@
#ifndef OPENSSL_NO_JPAKE
#include <openssl/jpake.h>
#endif
-#include <openssl/comp.h>
void ERR_load_crypto_strings(void)
{
@@ -126,7 +130,9 @@ void ERR_load_crypto_strings(void)
ERR_load_ASN1_strings();
ERR_load_CONF_strings();
ERR_load_CRYPTO_strings();
+#ifndef OPENSSL_NO_COMP
ERR_load_COMP_strings();
+#endif
#ifndef OPENSSL_NO_EC
ERR_load_EC_strings();
#endif
@@ -149,12 +155,14 @@ void ERR_load_crypto_strings(void)
#endif
ERR_load_OCSP_strings();
ERR_load_UI_strings();
+#ifdef OPENSSL_FIPS
+ ERR_load_FIPS_strings();
+#endif
#ifndef OPENSSL_NO_CMS
ERR_load_CMS_strings();
#endif
#ifndef OPENSSL_NO_JPAKE
ERR_load_JPAKE_strings();
#endif
- ERR_load_COMP_strings();
#endif
}
diff --git a/main/openssl/crypto/evp/bio_md.c b/main/openssl/crypto/evp/bio_md.c
index 9841e32e..144fdfd5 100644
--- a/main/openssl/crypto/evp/bio_md.c
+++ b/main/openssl/crypto/evp/bio_md.c
@@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl)
{
if (ret > 0)
{
- EVP_DigestUpdate(ctx,(const unsigned char *)in,
- (unsigned int)ret);
+ if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+ (unsigned int)ret))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}
if(b->next_bio != NULL)
@@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_DUP:
dbio=ptr;
dctx=dbio->ptr;
- EVP_MD_CTX_copy_ex(dctx,ctx);
+ if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+ return 0;
b->init=1;
break;
default:
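
/* Usage sketch, not part of the patch: a digest filter BIO. With the fix
 * above, a failing EVP_DigestUpdate() (possible in FIPS mode) surfaces as
 * a short write instead of being silently ignored. digest_buf() is a
 * hypothetical helper returning the digest length or -1; `md` must hold
 * EVP_MAX_MD_SIZE bytes. */
#include <openssl/bio.h>
#include <openssl/evp.h>

int digest_buf(const unsigned char *in, int inl, unsigned char *md)
	{
	int mdlen = -1;
	BIO *bmd = BIO_new(BIO_f_md());
	BIO *sink = BIO_new(BIO_s_null());

	if (bmd != NULL && sink != NULL)
		{
		BIO_set_md(bmd, EVP_sha1());
		BIO_push(bmd, sink); /* chain: bmd -> sink */
		if (BIO_write(bmd, in, inl) == inl)
			mdlen = BIO_gets(bmd, (char *)md, EVP_MAX_MD_SIZE);
		BIO_free_all(bmd); /* frees sink too */
		}
	else
		{
		BIO_free(bmd);
		BIO_free(sink);
		}
	return mdlen;
	}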
diff --git a/main/openssl/crypto/evp/bio_ok.c b/main/openssl/crypto/evp/bio_ok.c
index 98bc1ab4..e6433535 100644
--- a/main/openssl/crypto/evp/bio_ok.c
+++ b/main/openssl/crypto/evp/bio_ok.c
@@ -133,10 +133,10 @@ static int ok_new(BIO *h);
static int ok_free(BIO *data);
static long ok_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
-static void sig_out(BIO* b);
-static void sig_in(BIO* b);
-static void block_out(BIO* b);
-static void block_in(BIO* b);
+static int sig_out(BIO* b);
+static int sig_in(BIO* b);
+static int block_out(BIO* b);
+static int block_in(BIO* b);
#define OK_BLOCK_SIZE (1024*4)
#define OK_BLOCK_BLOCK 4
#define IOBS (OK_BLOCK_SIZE+ OK_BLOCK_BLOCK+ 3*EVP_MAX_MD_SIZE)
@@ -266,10 +266,24 @@ static int ok_read(BIO *b, char *out, int outl)
ctx->buf_len+= i;
/* no signature yet -- check if we got one */
- if (ctx->sigio == 1) sig_in(b);
+ if (ctx->sigio == 1)
+ {
+ if (!sig_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* signature ok -- check if we got block */
- if (ctx->sigio == 0) block_in(b);
+ if (ctx->sigio == 0)
+ {
+ if (!block_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* invalid block -- cancel */
if (ctx->cont <= 0) break;
@@ -293,7 +307,8 @@ static int ok_write(BIO *b, const char *in, int inl)
if ((ctx == NULL) || (b->next_bio == NULL) || (b->init == 0)) return(0);
- if(ctx->sigio) sig_out(b);
+ if(ctx->sigio && !sig_out(b))
+ return 0;
do{
BIO_clear_retry_flags(b);
@@ -332,7 +347,11 @@ static int ok_write(BIO *b, const char *in, int inl)
if(ctx->buf_len >= OK_BLOCK_SIZE+ OK_BLOCK_BLOCK)
{
- block_out(b);
+ if (!block_out(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}while(inl > 0);
@@ -379,7 +398,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_FLUSH:
/* do a final write */
if(ctx->blockout == 0)
- block_out(b);
+ if (!block_out(b))
+ return 0;
while (ctx->blockout)
{
@@ -408,7 +428,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
case BIO_C_SET_MD:
md=ptr;
- EVP_DigestInit_ex(&ctx->md, md, NULL);
+ if (!EVP_DigestInit_ex(&ctx->md, md, NULL))
+ return 0;
b->init=1;
break;
case BIO_C_GET_MD:
@@ -455,7 +476,7 @@ static void longswap(void *_ptr, size_t len)
}
}
-static void sig_out(BIO* b)
+static int sig_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -463,9 +484,10 @@ static void sig_out(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return;
+ if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
/* FIXME: there's absolutely no guarantee this makes any sense at all,
* particularly now EVP_MD_CTX has been restructured.
*/
@@ -474,14 +496,20 @@ static void sig_out(BIO* b)
longswap(&(ctx->buf[ctx->buf_len]), md->digest->md_size);
ctx->buf_len+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
ctx->sigio= 0;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void sig_in(BIO* b)
+static int sig_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -491,15 +519,18 @@ static void sig_in(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return;
+ if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
memcpy(md->md_data, &(ctx->buf[ctx->buf_off]), md->digest->md_size);
longswap(md->md_data, md->digest->md_size);
ctx->buf_off+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
ret= memcmp(&(ctx->buf[ctx->buf_off]), tmp, md->digest->md_size) == 0;
ctx->buf_off+= md->digest->md_size;
if(ret == 1)
@@ -516,9 +547,13 @@ static void sig_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_out(BIO* b)
+static int block_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -532,13 +567,20 @@ static void block_out(BIO* b)
ctx->buf[1]=(unsigned char)(tl>>16);
ctx->buf[2]=(unsigned char)(tl>>8);
ctx->buf[3]=(unsigned char)(tl);
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_in(BIO* b)
+static int block_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -554,10 +596,13 @@ static void block_in(BIO* b)
tl|=ctx->buf[2]; tl<<=8;
tl|=ctx->buf[3];
- if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return;
+ if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return 1;
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
if(memcmp(&(ctx->buf[tl+ OK_BLOCK_BLOCK]), tmp, md->digest->md_size) == 0)
{
/* there might be parts from next block lurking around ! */
@@ -571,5 +616,9 @@ static void block_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
diff --git a/main/openssl/crypto/evp/c_allc.c b/main/openssl/crypto/evp/c_allc.c
index c5f92683..2a45d435 100644
--- a/main/openssl/crypto/evp/c_allc.c
+++ b/main/openssl/crypto/evp/c_allc.c
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
#ifndef OPENSSL_NO_RC4
EVP_add_cipher(EVP_rc4());
EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+ EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
@@ -166,9 +169,9 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_128_cfb1());
EVP_add_cipher(EVP_aes_128_cfb8());
EVP_add_cipher(EVP_aes_128_ofb());
-#if 0
EVP_add_cipher(EVP_aes_128_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_128_gcm());
+ EVP_add_cipher(EVP_aes_128_xts());
EVP_add_cipher_alias(SN_aes_128_cbc,"AES128");
EVP_add_cipher_alias(SN_aes_128_cbc,"aes128");
EVP_add_cipher(EVP_aes_192_ecb());
@@ -177,9 +180,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_192_cfb1());
EVP_add_cipher(EVP_aes_192_cfb8());
EVP_add_cipher(EVP_aes_192_ofb());
-#if 0
EVP_add_cipher(EVP_aes_192_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_192_gcm());
EVP_add_cipher_alias(SN_aes_192_cbc,"AES192");
EVP_add_cipher_alias(SN_aes_192_cbc,"aes192");
EVP_add_cipher(EVP_aes_256_ecb());
@@ -188,11 +190,15 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_256_cfb1());
EVP_add_cipher(EVP_aes_256_cfb8());
EVP_add_cipher(EVP_aes_256_ofb());
-#if 0
EVP_add_cipher(EVP_aes_256_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_256_gcm());
+ EVP_add_cipher(EVP_aes_256_xts());
EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+ EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+ EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
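
/* Usage sketch, not part of the patch: with the registrations above, CTR
 * is enabled and GCM/XTS are newly available, so after
 * OpenSSL_add_all_ciphers() AES-GCM resolves both directly and by name. */
#include <openssl/evp.h>

int gcm_available(void)
	{
	OpenSSL_add_all_ciphers();
	return EVP_get_cipherbyname("id-aes128-GCM") != NULL
		&& EVP_aes_128_gcm() != NULL;
	}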
diff --git a/main/openssl/crypto/evp/digest.c b/main/openssl/crypto/evp/digest.c
index 982ba2b1..d14e8e48 100644
--- a/main/openssl/crypto/evp/digest.c
+++ b/main/openssl/crypto/evp/digest.c
@@ -117,6 +117,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
{
memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@ skip_to_init:
}
if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
return 1;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ if (FIPS_digestinit(ctx, type))
+ return 1;
+ OPENSSL_free(ctx->md_data);
+ ctx->md_data = NULL;
+ return 0;
+ }
+#endif
return ctx->digest->init(ctx);
}
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestupdate(ctx, data, count);
+#else
return ctx->update(ctx,data,count);
+#endif
}
/* The caller can assume that this removes any secret data from the context */
@@ -245,6 +263,9 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
/* The caller can assume that this removes any secret data from the context */
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestfinal(ctx, md, size);
+#else
int ret;
OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
@@ -258,6 +279,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
}
memset(ctx->md_data,0,ctx->digest->ctx_size);
return ret;
+#endif
}
int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -344,13 +366,17 @@ int EVP_Digest(const void *data, size_t count,
void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx)
{
- EVP_MD_CTX_cleanup(ctx);
- OPENSSL_free(ctx);
+ if (ctx)
+ {
+ EVP_MD_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
}
/* This call frees resources associated with the context */
int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
{
+#ifndef OPENSSL_FIPS
/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
* because sometimes only copies of the context are ever finalised.
*/
@@ -363,6 +389,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
OPENSSL_free(ctx->md_data);
}
+#endif
if (ctx->pctx)
EVP_PKEY_CTX_free(ctx->pctx);
#ifndef OPENSSL_NO_ENGINE
@@ -371,6 +398,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
* functional reference we held for this reason. */
ENGINE_finish(ctx->engine);
#endif
+#ifdef OPENSSL_FIPS
+ FIPS_md_ctx_cleanup(ctx);
+#endif
memset(ctx,'\0',sizeof *ctx);
return 1;
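
/* Usage sketch, not part of the patch: with the changes above,
 * EVP_MD_CTX_destroy(NULL) is safe and every step's return value matters,
 * since in FIPS mode init/update/final are routed through FIPS_digest*()
 * and can fail for a non-approved digest. */
#include <openssl/evp.h>

int sha256_digest(const void *data, size_t len,
		  unsigned char *md, unsigned int *mdlen)
	{
	EVP_MD_CTX *ctx = EVP_MD_CTX_create();
	int ok = ctx != NULL
		&& EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)
		&& EVP_DigestUpdate(ctx, data, len)
		&& EVP_DigestFinal_ex(ctx, md, mdlen);

	EVP_MD_CTX_destroy(ctx); /* NULL-safe after this patch */
	return ok;
	}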
diff --git a/main/openssl/crypto/evp/e_aes.c b/main/openssl/crypto/evp/e_aes.c
index bd6c0a3a..c7869b69 100644
--- a/main/openssl/crypto/evp/e_aes.c
+++ b/main/openssl/crypto/evp/e_aes.c
@@ -1,5 +1,5 @@
/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
#include <assert.h>
#include <openssl/aes.h>
#include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
typedef struct
{
AES_KEY ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_AES_KEY;
-#define data(ctx) EVP_C_DATA(EVP_AES_KEY,ctx)
-
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
- NID_aes_128, 16, 16, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
- NID_aes_192, 16, 24, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
- NID_aes_256, 16, 32, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-
-#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
-
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
-
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm;
+ unsigned char *iv; /* Temporary IV store */
+ int ivlen; /* IV length */
+ int taglen;
+ int iv_gen; /* It is OK to generate IVs */
+ int tls_aad_len; /* TLS AAD length */
+ ctr128_f ctr;
+ } EVP_AES_GCM_CTX;
+
+typedef struct
+ {
+ AES_KEY ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts;
+ void (*stream)(const unsigned char *in,
+ unsigned char *out, size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+ } EVP_AES_XTS_CTX;
+
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+ int len_set; /* Set if message length set */
+ int L, M; /* L and M parameters from RFC3610 */
+ CCM128_CONTEXT ccm;
+ ccm128_f str;
+ } EVP_AES_CCM_CTX;
+
+#define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+#endif
+
+#if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+ ((defined(__i386) || defined(__i386__) || \
+ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+ {
+ ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)aesni_cbc_encrypt :
+ NULL;
+ }
+ else {
+ ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_encrypt;
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)aesni_cbc_encrypt;
+ else if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ else
+ dat->stream.cbc = NULL;
+ }
+
+ if(ret < 0)
+ {
+ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+ return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+
+ if (len<bl) return 1;
+
+ aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+ return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f)aesni_encrypt);
+ gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ /* If we have an IV we can set it directly, otherwise use the
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If the key is set, use the IV now; otherwise save a copy for later */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key)
+ {
+ /* key_len is two AES keys */
+ if (enc)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ }
+ else
+ {
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)aesni_encrypt);
+ cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+ (ccm128_f)aesni_ccm64_decrypt_blocks;
+ cctx->key_set = 1;
+ }
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_init_key, \
+ aesni_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
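+
+/* For illustration: an instantiation such as
+ * BLOCK_CIPHER_generic(NID_aes,128,16,16,cbc,cbc,CBC,0) expands to roughly
+ *
+ *	static const EVP_CIPHER aesni_128_cbc = {
+ *		NID_aes_128_cbc, 16, 128/8, 16, 0|EVP_CIPH_CBC_MODE,
+ *		aesni_init_key, aesni_cbc_cipher, NULL,
+ *		sizeof(EVP_AES_KEY), NULL, NULL, NULL, NULL };
+ *	static const EVP_CIPHER aes_128_cbc = { ...same, with aes_* hooks... };
+ *	const EVP_CIPHER *EVP_aes_128_cbc(void)
+ *	{ return AESNI_CAPABLE ? &aesni_128_cbc : &aes_128_cbc; }
+ *
+ * so the AES-NI/software choice is made once per accessor call, not per block.
+ */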
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_##mode##_init_key, \
+ aesni_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- int ret;
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
- if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
- || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
- || enc)
- ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+ }
else
- ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+#ifdef AES_CTR_ASM
+ if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+ }
if(ret < 0)
{
@@ -117,4 +571,750 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+ else if (ctx->encrypt)
+ CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+ else
+ CRYPTO_cbc128_decrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+ return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+ size_t i;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (len<bl) return 1;
+
+ for (i=0,len-=bl;i<=len;i+=bl)
+ (*dat->block)(in+i,out+i,&dat->ks);
+
+ return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,dat->block);
+ return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+ CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+ }
+
+ while (len>=MAXBITCHUNK) {
+ CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ len-=MAXBITCHUNK;
+ }
+ if (len)
+ CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+
+ return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ unsigned int num = ctx->num;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->block);
+ ctx->num = (size_t)num;
+ return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ return 1;
+ }
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+ int n=8;
+ unsigned char c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ gctx->key_set = 0;
+ gctx->iv_set = 0;
+ gctx->ivlen = c->cipher->iv_len;
+ gctx->iv = c->iv;
+ gctx->taglen = -1;
+ gctx->iv_gen = 0;
+ gctx->tls_aad_len = -1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IVLEN:
+ if (arg <= 0)
+ return 0;
+#ifdef OPENSSL_FIPS
+ if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+ && arg < 12)
+ return 0;
+#endif
+ /* Allocate memory for IV if needed */
+ if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+ {
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ gctx->iv = OPENSSL_malloc(arg);
+ if (!gctx->iv)
+ return 0;
+ }
+ gctx->ivlen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_TAG:
+ if (arg <= 0 || arg > 16 || c->encrypt)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->taglen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_GET_TAG:
+ if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+ return 0;
+ memcpy(ptr, c->buf, arg);
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_FIXED:
+ /* Special case: -1 length restores whole IV */
+ if (arg == -1)
+ {
+ memcpy(gctx->iv, ptr, gctx->ivlen);
+ gctx->iv_gen = 1;
+ return 1;
+ }
+ /* Fixed field must be at least 4 bytes and invocation field
+ * at least 8.
+ */
+ if ((arg < 4) || (gctx->ivlen - arg) < 8)
+ return 0;
+ if (arg)
+ memcpy(gctx->iv, ptr, arg);
+ if (c->encrypt &&
+ RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+ return 0;
+ gctx->iv_gen = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_IV_GEN:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0)
+ return 0;
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ if (arg <= 0 || arg > gctx->ivlen)
+ arg = gctx->ivlen;
+ memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+ /* Invocation field will be at least 8 bytes in size and
+ * so no need to check wrap around or increment more than
+ * last 8 bytes.
+ */
+ ctr64_inc(gctx->iv + gctx->ivlen - 8);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_INV:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+ return 0;
+ memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ /* Save the AAD for later use */
+ if (arg != 13)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->tls_aad_len = arg;
+ {
+ unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+ /* Correct length for explicit IV */
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ /* If decrypting correct for tag too */
+ if (!c->encrypt)
+ len -= EVP_GCM_TLS_TAG_LEN;
+ c->buf[arg-2] = len>>8;
+ c->buf[arg-1] = len & 0xff;
+ }
+ /* Extra padding: tag appended to record */
+ return EVP_GCM_TLS_TAG_LEN;
+
+ default:
+ return -1;
+
+ }
+ }
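+
+/* For illustration, a minimal encrypt-side caller of the ctrls above
+ * (hedged sketch: key/iv/aad/in/out supplied by the caller, all return
+ * values folded into one flag, 1.0.1-era EVP API assumed):
+ *
+ *	EVP_CIPHER_CTX c;
+ *	int outl, tmpl, ok = 1;
+ *	EVP_CIPHER_CTX_init(&c);
+ *	ok &= EVP_EncryptInit_ex(&c, EVP_aes_128_gcm(), NULL, NULL, NULL);
+ *	ok &= EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_GCM_SET_IVLEN, 12, NULL);
+ *	ok &= EVP_EncryptInit_ex(&c, NULL, NULL, key, iv);
+ *	ok &= EVP_EncryptUpdate(&c, NULL, &outl, aad, aadlen);	(AAD: out == NULL)
+ *	ok &= EVP_EncryptUpdate(&c, out, &outl, in, inlen);
+ *	ok &= EVP_EncryptFinal_ex(&c, out + outl, &tmpl);
+ *	ok &= EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_GCM_GET_TAG, 16, tag);
+ *	EVP_CIPHER_CTX_cleanup(&c);
+ */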
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ { do {
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ {
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)AES_encrypt);
+ gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ break;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)vpaes_encrypt);
+ gctx->ctr = NULL;
+ break;
+ }
+ else
+#endif
+ (void)0; /* terminate potentially open 'else' */
+
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+ gctx->ctr = NULL;
+#endif
+ } while (0);
+
+ /* If we have an IV we can set it directly, otherwise use the
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If the key is set, use the IV now; otherwise save a copy for later */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
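+/* With the evp.h constants added below, a record is laid out as
+ *	[ explicit IV : EVP_GCM_TLS_EXPLICIT_IV_LEN = 8 bytes ]
+ *	[ ciphertext  : len - 8 - 16 bytes                    ]
+ *	[ tag         : EVP_GCM_TLS_TAG_LEN = 16 bytes        ]
+ */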
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ int rv = -1;
+ /* Encrypt/decrypt must be performed in place */
+ if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+ return -1;
+ /* Set IV from start of buffer or generate IV and write to start
+ * of buffer.
+ */
+ if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+ EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+ EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+ goto err;
+ /* Use saved AAD */
+ if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+ goto err;
+ /* Fix buffer and length to point to payload */
+ in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ if (ctx->encrypt)
+ {
+ /* Encrypt payload */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ out += len;
+ /* Finally write tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+ rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ }
+ else
+ {
+ /* Decrypt */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ /* Retrieve tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+ EVP_GCM_TLS_TAG_LEN);
+ /* If tag mismatch wipe buffer */
+ if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+ {
+ OPENSSL_cleanse(out, len);
+ goto err;
+ }
+ rv = len;
+ }
+
+ err:
+ gctx->iv_set = 0;
+ gctx->tls_aad_len = -1;
+ return rv;
+ }
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ /* If not set up, return error */
+ if (!gctx->key_set)
+ return -1;
+
+ if (gctx->tls_aad_len >= 0)
+ return aes_gcm_tls_cipher(ctx, out, in, len);
+
+ if (!gctx->iv_set)
+ return -1;
+ if (in)
+ {
+ if (out == NULL)
+ {
+ if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+ return -1;
+ }
+ else if (ctx->encrypt)
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ else
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ return len;
+ }
+ else
+ {
+ if (!ctx->encrypt)
+ {
+ if (gctx->taglen < 0)
+ return -1;
+ if (CRYPTO_gcm128_finish(&gctx->gcm,
+ ctx->buf, gctx->taglen) != 0)
+ return -1;
+ gctx->iv_set = 0;
+ return 0;
+ }
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+ gctx->taglen = 16;
+ /* Don't reuse the IV */
+ gctx->iv_set = 0;
+ return 0;
+ }
+
+ }
+
+#define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_XTS_CTX *xctx = c->cipher_data;
+ if (type != EVP_CTRL_INIT)
+ return -1;
+ /* key1 and key2 are used as an indicator both key and IV are set */
+ xctx->xts.key1 = NULL;
+ xctx->xts.key2 = NULL;
+ return 1;
+ }
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key) do
+ {
+#ifdef AES_XTS_ASM
+ xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+ xctx->stream = NULL;
+#endif
+ /* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ if (enc)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_encrypt;
+ }
+ else
+ {
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ }
+ else
+#endif
+ (void)0; /* terminate potentially open 'else' */
+
+ if (enc)
+ {
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_encrypt;
+ }
+ else
+ {
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ } while (0);
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!xctx->xts.key1 || !xctx->xts.key2)
+ return 0;
+ if (!out || !in || len<AES_BLOCK_SIZE)
+ return 0;
+#ifdef OPENSSL_FIPS
+ /* Requirement of SP800-38E */
+ if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+ (len > (1UL<<20)*16))
+ {
+ EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+ return 0;
+ }
+#endif
+ if (xctx->stream)
+ (*xctx->stream)(in, out, len,
+ xctx->xts.key1, xctx->xts.key2, ctx->iv);
+ else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+ ctx->encrypt))
+ return 0;
+ return 1;
+ }
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
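+
+/* For illustration: the XTS doubling in BLOCK_CIPHER_custom means
+ * EVP_aes_128_xts reports a 32-byte key (key1||key2) and EVP_aes_256_xts a
+ * 64-byte one, which is why EVP_MAX_KEY_LENGTH grows to 64 in evp.h below.
+ * A hedged sketch of a caller, with the 16-byte IV carrying the tweak
+ * (e.g. a sector number):
+ *
+ *	unsigned char key[32], tweak[16];
+ *	EVP_CIPHER_CTX c;
+ *	int outl;
+ *	EVP_CIPHER_CTX_init(&c);
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_xts(), NULL, key, tweak);
+ *	EVP_EncryptUpdate(&c, out, &outl, sector, 512);
+ *	EVP_CIPHER_CTX_cleanup(&c);
+ */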
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_CCM_CTX *cctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ cctx->key_set = 0;
+ cctx->iv_set = 0;
+ cctx->L = 8;
+ cctx->M = 12;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_IVLEN:
+ arg = 15 - arg;
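+ /* fall through: L and the nonce length are related by ivlen = 15 - L */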
+ case EVP_CTRL_CCM_SET_L:
+ if (arg < 2 || arg > 8)
+ return 0;
+ cctx->L = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_TAG:
+ if ((arg & 1) || arg < 4 || arg > 16)
+ return 0;
+ if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+ return 0;
+ if (ptr)
+ {
+ cctx->tag_set = 1;
+ memcpy(c->buf, ptr, arg);
+ }
+ cctx->M = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_GET_TAG:
+ if (!c->encrypt || !cctx->tag_set)
+ return 0;
+ if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+ return 0;
+ cctx->tag_set = 0;
+ cctx->iv_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) do
+ {
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)vpaes_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)AES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ } while (0);
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ CCM128_CONTEXT *ccm = &cctx->ccm;
+ /* If not set up, return error */
+ if (!cctx->iv_set && !cctx->key_set)
+ return -1;
+ if (!ctx->encrypt && !cctx->tag_set)
+ return -1;
+ if (!out)
+ {
+ if (!in)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+ return -1;
+ cctx->len_set = 1;
+ return len;
+ }
+ /* If we have AAD, the total message length must already be set */
+ if (!cctx->len_set && len)
+ return -1;
+ CRYPTO_ccm128_aad(ccm, in, len);
+ return len;
+ }
+ /* EVP_*Final() doesn't return any data */
+ if (!in)
+ return 0;
+ /* If the length has not been set yet, set it now */
+ if (!cctx->len_set)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+ return -1;
+ cctx->len_set = 1;
+ }
+ if (ctx->encrypt)
+ {
+ if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ CRYPTO_ccm128_encrypt(ccm, in, out, len))
+ return -1;
+ cctx->tag_set = 1;
+ return len;
+ }
+ else
+ {
+ int rv = -1;
+ if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ !CRYPTO_ccm128_decrypt(ccm, in, out, len))
+ {
+ unsigned char tag[16];
+ if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+ {
+ if (!memcmp(tag, ctx->buf, cctx->M))
+ rv = len;
+ }
+ }
+ if (rv == -1)
+ OPENSSL_cleanse(out, len);
+ cctx->iv_set = 0;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return rv;
+ }
+
+ }
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
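+
+/* For illustration, the encrypt-side CCM sequence the code above implies
+ * (hedged sketch; CCM needs the total length before any AAD, and on
+ * encrypt EVP_CTRL_CCM_SET_TAG takes a NULL ptr and only fixes M):
+ *
+ *	EVP_CIPHER_CTX_init(&c);
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_ccm(), NULL, NULL, NULL);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_SET_IVLEN, 7, NULL);	(L = 8)
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_SET_TAG, 12, NULL);
+ *	EVP_EncryptInit_ex(&c, NULL, NULL, key, nonce);	(7-byte nonce)
+ *	EVP_EncryptUpdate(&c, NULL, &outl, NULL, inlen);	(set total length)
+ *	EVP_EncryptUpdate(&c, NULL, &outl, aad, aadlen);
+ *	EVP_EncryptUpdate(&c, out, &outl, in, inlen);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_GET_TAG, 12, tag);
+ */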
+
+#endif
#endif
diff --git a/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c b/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 00000000..fb2c884a
--- /dev/null
+++ b/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,581 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+ {
+ AES_KEY ks;
+ SHA_CTX head,tail,md;
+ size_t payload_length; /* AAD length in decrypt case */
+ union {
+ unsigned int tls_ver;
+ unsigned char tls_aad[16]; /* 13 used */
+ } aux;
+ } EVP_AES_HMAC_SHA1;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+#if defined(AES_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+#if defined(__GNUC__) && __GNUC__>=2 && !defined(PEDANTIC)
+# define BSWAP(x) ({ unsigned int r=(x); asm ("bswapl %0":"=r"(r):"0"(r)); r; })
+#endif
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ int ret;
+
+ if (enc)
+ ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+ else
+ ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+ SHA1_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return ret<0?0:1;
+ }
+
+#define STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
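+/* Hand-rolled SHA1_Update replacement: the unaligned head and tail go
+ * through SHA1_Update, whole blocks go straight to the compression
+ * function, and the Nh/Nl bit count is maintained by hand. */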
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{ const unsigned char *ptr = data;
+ size_t res;
+
+ if ((res = c->num)) {
+ res = SHA_CBLOCK-res;
+ if (len<res) res=len;
+ SHA1_Update (c,ptr,res);
+ ptr += res;
+ len -= res;
+ }
+
+ res = len % SHA_CBLOCK;
+ len -= res;
+
+ if (len) {
+ sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+ ptr += len;
+ c->Nh += len>>29;
+ c->Nl += len<<=3;
+ if (c->Nl<(unsigned int)len) c->Nh++;
+ }
+
+ if (res)
+ SHA1_Update(c,ptr,res);
+}
+
+#ifdef SHA1_Update
+#undef SHA1_Update
+#endif
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ unsigned int l;
+ size_t plen = key->payload_length,
+ iv = 0, /* explicit IV in TLS 1.1 and later */
+ sha_off = 0;
+#if defined(STITCHED_CALL)
+ size_t aes_off = 0,
+ blocks;
+
+ sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ if (len%AES_BLOCK_SIZE) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH)
+ plen = len;
+ else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+ return 0;
+ else if (key->aux.tls_ver >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+ if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+ SHA1_Update(&key->md,in+iv,sha_off);
+
+ aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+ ctx->iv,&key->md,in+iv+sha_off);
+ blocks *= SHA_CBLOCK;
+ aes_off += blocks;
+ sha_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ sha_off = 0;
+ }
+#endif
+ sha_off += iv;
+ SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+ /* calculate HMAC and append it to payload */
+ SHA1_Final(out+plen,&key->md);
+ key->md = key->tail;
+ SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+ SHA1_Final(out+plen,&key->md);
+
+ /* pad the payload|hmac */
+ plen += SHA_DIGEST_LENGTH;
+ for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+ /* encrypt HMAC|padding at once */
+ aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ } else {
+ aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ }
+ } else {
+ union { unsigned int u[SHA_DIGEST_LENGTH/sizeof(unsigned int)];
+ unsigned char c[32+SHA_DIGEST_LENGTH]; } mac, *pmac;
+
+ /* arrange cache line alignment */
+ pmac = (void *)(((size_t)mac.c+31)&((size_t)0-32));
+
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in,out,len,
+ &key->ks,ctx->iv,0);
+
+ if (plen) { /* "TLS" mode of operation */
+ size_t inp_len, mask, j, i;
+ unsigned int res, maxpad, pad, bitlen;
+ int ret = 1;
+ union { unsigned int u[SHA_LBLOCK];
+ unsigned char c[SHA_CBLOCK]; }
+ *data = (void *)key->md.data;
+
+ if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+ >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+ if (len<(iv+SHA_DIGEST_LENGTH+1))
+ return 0;
+
+ /* omit explicit iv */
+ out += iv;
+ len -= iv;
+
+ /* figure out payload length */
+ pad = out[len-1];
+ maxpad = len-(SHA_DIGEST_LENGTH+1);
+ maxpad |= (255-maxpad)>>(sizeof(maxpad)*8-8);
+ maxpad &= 255;
+
+ inp_len = len - (SHA_DIGEST_LENGTH+pad+1);
+ mask = (0-((inp_len-len)>>(sizeof(inp_len)*8-1)));
+ inp_len &= mask;
+ ret &= (int)mask;
+
+ key->aux.tls_aad[plen-2] = inp_len>>8;
+ key->aux.tls_aad[plen-1] = inp_len;
+
+ /* calculate HMAC */
+ key->md = key->head;
+ SHA1_Update(&key->md,key->aux.tls_aad,plen);
+
+#if 1
+ len -= SHA_DIGEST_LENGTH; /* amend mac */
+ if (len>=(256+SHA_CBLOCK)) {
+ j = (len-(256+SHA_CBLOCK))&(0-SHA_CBLOCK);
+ j += SHA_CBLOCK-key->md.num;
+ SHA1_Update(&key->md,out,j);
+ out += j;
+ len -= j;
+ inp_len -= j;
+ }
+
+ /* but pretend as if we hashed padded payload */
+ bitlen = key->md.Nl+(inp_len<<3); /* at most 18 bits */
+#ifdef BSWAP
+ bitlen = BSWAP(bitlen);
+#else
+ mac.c[0] = 0;
+ mac.c[1] = (unsigned char)(bitlen>>16);
+ mac.c[2] = (unsigned char)(bitlen>>8);
+ mac.c[3] = (unsigned char)bitlen;
+ bitlen = mac.u[0];
+#endif
+
+ pmac->u[0]=0;
+ pmac->u[1]=0;
+ pmac->u[2]=0;
+ pmac->u[3]=0;
+ pmac->u[4]=0;
+
+ for (res=key->md.num, j=0;j<len;j++) {
+ size_t c = out[j];
+ mask = (j-inp_len)>>(sizeof(j)*8-8);
+ c &= mask;
+ c |= 0x80&~mask&~((inp_len-j)>>(sizeof(j)*8-8));
+ data->c[res++]=(unsigned char)c;
+
+ if (res!=SHA_CBLOCK) continue;
+
+ /* j is not incremented yet */
+ mask = 0-((inp_len+7-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-72)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+ res=0;
+ }
+
+ for(i=res;i<SHA_CBLOCK;i++,j++) data->c[i]=0;
+
+ if (res>SHA_CBLOCK-8) {
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+ memset(data,0,SHA_CBLOCK);
+ j+=64;
+ }
+ data->u[SHA_LBLOCK-1] = bitlen;
+ sha1_block_data_order(&key->md,data,1);
+ mask = 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+#ifdef BSWAP
+ pmac->u[0] = BSWAP(pmac->u[0]);
+ pmac->u[1] = BSWAP(pmac->u[1]);
+ pmac->u[2] = BSWAP(pmac->u[2]);
+ pmac->u[3] = BSWAP(pmac->u[3]);
+ pmac->u[4] = BSWAP(pmac->u[4]);
+#else
+ for (i=0;i<5;i++) {
+ res = pmac->u[i];
+ pmac->c[4*i+0]=(unsigned char)(res>>24);
+ pmac->c[4*i+1]=(unsigned char)(res>>16);
+ pmac->c[4*i+2]=(unsigned char)(res>>8);
+ pmac->c[4*i+3]=(unsigned char)res;
+ }
+#endif
+ len += SHA_DIGEST_LENGTH;
+#else
+ SHA1_Update(&key->md,out,inp_len);
+ res = key->md.num;
+ SHA1_Final(pmac->c,&key->md);
+
+ {
+ unsigned int inp_blocks, pad_blocks;
+
+ /* but pretend as if we hashed padded payload */
+ inp_blocks = 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ res += (unsigned int)(len-inp_len);
+ pad_blocks = res / SHA_CBLOCK;
+ res %= SHA_CBLOCK;
+ pad_blocks += 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ for (;inp_blocks<pad_blocks;inp_blocks++)
+ sha1_block_data_order(&key->md,data,1);
+ }
+#endif
+ key->md = key->tail;
+ SHA1_Update(&key->md,pmac->c,SHA_DIGEST_LENGTH);
+ SHA1_Final(pmac->c,&key->md);
+
+ /* verify HMAC */
+ out += inp_len;
+ len -= inp_len;
+#if 1
+ {
+ unsigned char *p = out+len-1-maxpad-SHA_DIGEST_LENGTH;
+ size_t off = out-p;
+ unsigned int c, cmask;
+
+ maxpad += SHA_DIGEST_LENGTH;
+ for (res=0,i=0,j=0;j<maxpad;j++) {
+ c = p[j];
+ cmask = ((int)(j-off-SHA_DIGEST_LENGTH))>>(sizeof(int)*8-1);
+ res |= (c^pad)&~cmask; /* ... and padding */
+ cmask &= ((int)(off-1-j))>>(sizeof(int)*8-1);
+ res |= (c^pmac->c[i])&cmask;
+ i += 1&cmask;
+ }
+ maxpad -= SHA_DIGEST_LENGTH;
+
+ res = 0-((0-res)>>(sizeof(res)*8-1));
+ ret &= (int)~res;
+ }
+#else
+ for (res=0,i=0;i<SHA_DIGEST_LENGTH;i++)
+ res |= out[i]^pmac->c[i];
+ res = 0-((0-res)>>(sizeof(res)*8-1));
+ ret &= (int)~res;
+
+ /* verify padding */
+ pad = (pad&~res) | (maxpad&res);
+ out = out+len-1-pad;
+ for (res=0,i=0;i<pad;i++)
+ res |= out[i]^pad;
+
+ res = (0-res)>>(sizeof(res)*8-1);
+ ret &= (int)~res;
+#endif
+ return ret;
+ } else {
+ SHA1_Update(&key->md,out,len);
+ }
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+ switch (type)
+ {
+ case EVP_CTRL_AEAD_SET_MAC_KEY:
+ {
+ unsigned int i;
+ unsigned char hmac_key[64];
+
+ memset (hmac_key,0,sizeof(hmac_key));
+
+ if (arg > (int)sizeof(hmac_key)) {
+ SHA1_Init(&key->head);
+ SHA1_Update(&key->head,ptr,arg);
+ SHA1_Final(hmac_key,&key->head);
+ } else {
+ memcpy(hmac_key,ptr,arg);
+ }
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36; /* ipad */
+ SHA1_Init(&key->head);
+ SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36^0x5c; /* opad */
+ SHA1_Init(&key->tail);
+ SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+ OPENSSL_cleanse(hmac_key,sizeof(hmac_key));
+
+ return 1;
+ }
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ {
+ unsigned char *p=ptr;
+ unsigned int len=p[arg-2]<<8|p[arg-1];
+
+ if (ctx->encrypt)
+ {
+ key->payload_length = len;
+ if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+ len -= AES_BLOCK_SIZE;
+ p[arg-2] = len>>8;
+ p[arg-1] = len;
+ }
+ key->md = key->head;
+ SHA1_Update(&key->md,p,arg);
+
+ return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+ - len);
+ }
+ else
+ {
+ if (arg>13) arg = 13;
+ memcpy(key->aux.tls_aad,ptr,arg);
+ key->payload_length = arg;
+
+ return SHA_DIGEST_LENGTH;
+ }
+ }
+ default:
+ return -1;
+ }
+ }
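+
+/* For illustration, the TLS-side call pattern these ctrls expect
+ * (hedged sketch): after keying the cipher, the record layer sets the
+ * MAC key, feeds the 13-byte AAD, and learns how much room to reserve:
+ *
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_cbc_hmac_sha1(), NULL, key, iv);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_AEAD_SET_MAC_KEY, maclen, mac_key);
+ *	pad = EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_AEAD_TLS1_AAD, 13, aad);
+ *
+ * The record buffer then grows by pad bytes and a single EVP_Cipher()
+ * call encrypts the payload and appends the HMAC and CBC padding.
+ */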
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+ {
+#ifdef NID_aes_128_cbc_hmac_sha1
+ NID_aes_128_cbc_hmac_sha1,
+#else
+ NID_undef,
+#endif
+ 16,16,16,
+ EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+ aesni_cbc_hmac_sha1_init_key,
+ aesni_cbc_hmac_sha1_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA1),
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha1_ctrl,
+ NULL
+ };
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+ {
+#ifdef NID_aes_256_cbc_hmac_sha1
+ NID_aes_256_cbc_hmac_sha1,
+#else
+ NID_undef,
+#endif
+ 16,32,16,
+ EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+ aesni_cbc_hmac_sha1_init_key,
+ aesni_cbc_hmac_sha1_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA1),
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha1_ctrl,
+ NULL
+ };
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+ {
+ return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+ &aesni_128_cbc_hmac_sha1_cipher:NULL);
+ }
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+ {
+ return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+ &aesni_256_cbc_hmac_sha1_cipher:NULL);
+ }
+#else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+ {
+ return NULL;
+ }
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+ {
+ return NULL;
+ }
+#endif
+#endif
diff --git a/main/openssl/crypto/evp/e_des3.c b/main/openssl/crypto/evp/e_des3.c
index 3232cfe0..8d7b7de2 100644
--- a/main/openssl/crypto/evp/e_des3.c
+++ b/main/openssl/crypto/evp/e_des3.c
@@ -65,6 +65,8 @@
#include <openssl/des.h>
#include <openssl/rand.h>
+#ifndef OPENSSL_FIPS
+
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv,int enc);
@@ -99,7 +101,7 @@ static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_ede_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_ofb64_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -130,7 +132,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
printf("\n");
}
#endif /* KSSL_DEBUG */
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_cbc_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -149,7 +151,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_ede_cfb64_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_cfb64_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -311,3 +313,4 @@ const EVP_CIPHER *EVP_des_ede3(void)
return &des_ede3_ecb;
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/e_null.c b/main/openssl/crypto/evp/e_null.c
index 7cf50e14..f0c1f78b 100644
--- a/main/openssl/crypto/evp/e_null.c
+++ b/main/openssl/crypto/evp/e_null.c
@@ -61,6 +61,8 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
+#ifndef OPENSSL_FIPS
+
static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv,int enc);
static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
@@ -99,4 +101,4 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
memcpy((char *)out,(const char *)in,inl);
return 1;
}
-
+#endif
diff --git a/main/openssl/crypto/evp/e_rc2.c b/main/openssl/crypto/evp/e_rc2.c
index f78d7811..d4c33b58 100644
--- a/main/openssl/crypto/evp/e_rc2.c
+++ b/main/openssl/crypto/evp/e_rc2.c
@@ -183,7 +183,8 @@ static int rc2_get_asn1_type_and_iv(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
key_bits =rc2_magic_to_meth((int)num);
if (!key_bits)
return(-1);
- if(i > 0) EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1);
+ if(i > 0 && !EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1))
+ return -1;
EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_SET_RC2_KEY_BITS, key_bits, NULL);
EVP_CIPHER_CTX_set_key_length(c, key_bits / 8);
}
diff --git a/main/openssl/crypto/evp/e_rc4.c b/main/openssl/crypto/evp/e_rc4.c
index 8b5175e0..b4f6bda8 100644
--- a/main/openssl/crypto/evp/e_rc4.c
+++ b/main/openssl/crypto/evp/e_rc4.c
@@ -62,6 +62,7 @@
#ifndef OPENSSL_NO_RC4
#include <openssl/evp.h>
+#include "evp_locl.h"
#include <openssl/objects.h>
#include <openssl/rc4.h>
diff --git a/main/openssl/crypto/evp/e_rc4_hmac_md5.c b/main/openssl/crypto/evp/e_rc4_hmac_md5.c
new file mode 100644
index 00000000..56563191
--- /dev/null
+++ b/main/openssl/crypto/evp/e_rc4_hmac_md5.c
@@ -0,0 +1,298 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE 16
+
+typedef struct
+ {
+ RC4_KEY ks;
+ MD5_CTX head,tail,md;
+ size_t payload_length;
+ } EVP_RC4_HMAC_MD5;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+ MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+ RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx),
+ inkey);
+
+ MD5_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return 1;
+ }
+
+#if !defined(OPENSSL_NO_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) ) && \
+ !(defined(__APPLE__) && defined(__MACH__))
+#define STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define rc4_off 0
+#define md5_off 0
+#endif
+
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+ size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */
+ md5_off = MD5_CBLOCK-key->md.num,
+ blocks;
+ unsigned int l;
+ extern unsigned int OPENSSL_ia32cap_P[];
+#endif
+ size_t plen = key->payload_length;
+
+ if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH) plen = len;
+#if defined(STITCHED_CALL)
+ /* cipher has to "fall behind" */
+ if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+ if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) &&
+ (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+ MD5_Update(&key->md,in,md5_off);
+ RC4(&key->ks,rc4_off,in,out);
+
+ rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+ &key->md,in+md5_off,blocks);
+ blocks *= MD5_CBLOCK;
+ rc4_off += blocks;
+ md5_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ rc4_off = 0;
+ md5_off = 0;
+ }
+#endif
+ MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+ /* calculate HMAC and append it to payload */
+ MD5_Final(out+plen,&key->md);
+ key->md = key->tail;
+ MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+ MD5_Final(out+plen,&key->md);
+ /* encrypt HMAC at once */
+ RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+ } else {
+ RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+ }
+ } else {
+ unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+ /* digest has to "fall behind" */
+ if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK;
+ else rc4_off += MD5_CBLOCK;
+
+ if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) &&
+ (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+ RC4(&key->ks,rc4_off,in,out);
+ MD5_Update(&key->md,out,md5_off);
+
+ rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+ &key->md,out+md5_off,blocks);
+ blocks *= MD5_CBLOCK;
+ rc4_off += blocks;
+ md5_off += blocks;
+ l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+ if (l<key->md.Nl) key->md.Nh++;
+ key->md.Nl = l;
+ key->md.Nh += blocks>>29;
+ } else {
+ md5_off=0;
+ rc4_off=0;
+ }
+#endif
+ /* decrypt HMAC at once */
+ RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+ if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */
+ MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+ /* calculate HMAC and verify it */
+ MD5_Final(mac,&key->md);
+ key->md = key->tail;
+ MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+ MD5_Final(mac,&key->md);
+
+ if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH))
+ return 0;
+ } else {
+ MD5_Update(&key->md,out+md5_off,len-md5_off);
+ }
+ }
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return 1;
+ }
+
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+ switch (type)
+ {
+ case EVP_CTRL_AEAD_SET_MAC_KEY:
+ {
+ unsigned int i;
+ unsigned char hmac_key[64];
+
+ memset (hmac_key,0,sizeof(hmac_key));
+
+ if (arg > (int)sizeof(hmac_key)) {
+ MD5_Init(&key->head);
+ MD5_Update(&key->head,ptr,arg);
+ MD5_Final(hmac_key,&key->head);
+ } else {
+ memcpy(hmac_key,ptr,arg);
+ }
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36; /* ipad */
+ MD5_Init(&key->head);
+ MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36^0x5c; /* opad */
+ MD5_Init(&key->tail);
+ MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+ return 1;
+ }
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ {
+ unsigned char *p=ptr;
+ unsigned int len=p[arg-2]<<8|p[arg-1];
+
+ if (!ctx->encrypt)
+ {
+ len -= MD5_DIGEST_LENGTH;
+ p[arg-2] = len>>8;
+ p[arg-1] = len;
+ }
+ key->payload_length=len;
+ key->md = key->head;
+ MD5_Update(&key->md,p,arg);
+
+ return MD5_DIGEST_LENGTH;
+ }
+ default:
+ return -1;
+ }
+ }
+
+static EVP_CIPHER r4_hmac_md5_cipher=
+ {
+#ifdef NID_rc4_hmac_md5
+ NID_rc4_hmac_md5,
+#else
+ NID_undef,
+#endif
+ 1,EVP_RC4_KEY_SIZE,0,
+ EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+ rc4_hmac_md5_init_key,
+ rc4_hmac_md5_cipher,
+ NULL,
+ sizeof(EVP_RC4_HMAC_MD5),
+ NULL,
+ NULL,
+ rc4_hmac_md5_ctrl,
+ NULL
+ };
+
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+ {
+ return(&r4_hmac_md5_cipher);
+ }
+#endif
diff --git a/main/openssl/crypto/evp/evp.h b/main/openssl/crypto/evp/evp.h
index 9f9795e2..e43a58e6 100644
--- a/main/openssl/crypto/evp/evp.h
+++ b/main/openssl/crypto/evp/evp.h
@@ -83,7 +83,7 @@
#define EVP_RC5_32_12_16_KEY_SIZE 16
*/
#define EVP_MAX_MD_SIZE 64 /* longest known is SHA512 */
-#define EVP_MAX_KEY_LENGTH 32
+#define EVP_MAX_KEY_LENGTH 64
#define EVP_MAX_IV_LENGTH 16
#define EVP_MAX_BLOCK_LENGTH 32
@@ -116,6 +116,7 @@
#define EVP_PKEY_DH NID_dhKeyAgreement
#define EVP_PKEY_EC NID_X9_62_id_ecPublicKey
#define EVP_PKEY_HMAC NID_hmac
+#define EVP_PKEY_CMAC NID_cmac
#ifdef __cplusplus
extern "C" {
@@ -216,6 +217,8 @@ typedef int evp_verify_method(int type,const unsigned char *m,
#define EVP_MD_FLAG_DIGALGID_CUSTOM 0x0018
+#define EVP_MD_FLAG_FIPS 0x0400 /* Note if suitable for use in FIPS mode */
+
/* Digest ctrls */
#define EVP_MD_CTRL_DIGALGID 0x1
@@ -325,6 +328,10 @@ struct evp_cipher_st
#define EVP_CIPH_CBC_MODE 0x2
#define EVP_CIPH_CFB_MODE 0x3
#define EVP_CIPH_OFB_MODE 0x4
+#define EVP_CIPH_CTR_MODE 0x5
+#define EVP_CIPH_GCM_MODE 0x6
+#define EVP_CIPH_CCM_MODE 0x7
+#define EVP_CIPH_XTS_MODE 0x10001
#define EVP_CIPH_MODE 0xF0007
/* Set if variable length cipher */
#define EVP_CIPH_VARIABLE_LENGTH 0x8
@@ -346,6 +353,15 @@ struct evp_cipher_st
#define EVP_CIPH_FLAG_DEFAULT_ASN1 0x1000
/* Buffer length in bits not bytes: CFB1 mode only */
#define EVP_CIPH_FLAG_LENGTH_BITS 0x2000
+/* Note if suitable for use in FIPS mode */
+#define EVP_CIPH_FLAG_FIPS 0x4000
+/* Allow non FIPS cipher in FIPS mode */
+#define EVP_CIPH_FLAG_NON_FIPS_ALLOW 0x8000
+/* Cipher handles any and all padding logic as well
+ * as finalisation.
+ */
+#define EVP_CIPH_FLAG_CUSTOM_CIPHER 0x100000
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
/* ctrl() values */
@@ -358,6 +374,33 @@ struct evp_cipher_st
#define EVP_CTRL_RAND_KEY 0x6
#define EVP_CTRL_PBE_PRF_NID 0x7
#define EVP_CTRL_COPY 0x8
+#define EVP_CTRL_GCM_SET_IVLEN 0x9
+#define EVP_CTRL_GCM_GET_TAG 0x10
+#define EVP_CTRL_GCM_SET_TAG 0x11
+#define EVP_CTRL_GCM_SET_IV_FIXED 0x12
+#define EVP_CTRL_GCM_IV_GEN 0x13
+#define EVP_CTRL_CCM_SET_IVLEN EVP_CTRL_GCM_SET_IVLEN
+#define EVP_CTRL_CCM_GET_TAG EVP_CTRL_GCM_GET_TAG
+#define EVP_CTRL_CCM_SET_TAG EVP_CTRL_GCM_SET_TAG
+#define EVP_CTRL_CCM_SET_L 0x14
+#define EVP_CTRL_CCM_SET_MSGLEN 0x15
+/* AEAD cipher deduces payload length and returns number of bytes
+ * required to store MAC and eventual padding. Subsequent call to
+ * EVP_Cipher even appends/verifies MAC.
+ */
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+/* Used by composite AEAD ciphers, no-op in GCM, CCM... */
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+/* Set the GCM invocation field, decrypt only */
+#define EVP_CTRL_GCM_SET_IV_INV 0x18
+
+/* GCM TLS constants */
+/* Length of fixed part of IV derived from PRF */
+#define EVP_GCM_TLS_FIXED_IV_LEN 4
+/* Length of explicit part of IV part of TLS records */
+#define EVP_GCM_TLS_EXPLICIT_IV_LEN 8
+/* Length of tag for TLS */
+#define EVP_GCM_TLS_TAG_LEN 16
typedef struct evp_cipher_info_st
{
@@ -375,7 +418,7 @@ struct evp_cipher_ctx_st
unsigned char oiv[EVP_MAX_IV_LENGTH]; /* original iv */
unsigned char iv[EVP_MAX_IV_LENGTH]; /* working iv */
unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */
- int num; /* used by cfb/ofb mode */
+ int num; /* used by cfb/ofb/ctr mode */
void *app_data; /* application stuff */
int key_len; /* May change for variable length cipher */
@@ -695,6 +738,9 @@ const EVP_MD *EVP_dev_crypto_md5(void);
#ifndef OPENSSL_NO_RC4
const EVP_CIPHER *EVP_rc4(void);
const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
const EVP_CIPHER *EVP_idea_ecb(void);
@@ -741,9 +787,10 @@ const EVP_CIPHER *EVP_aes_128_cfb8(void);
const EVP_CIPHER *EVP_aes_128_cfb128(void);
# define EVP_aes_128_cfb EVP_aes_128_cfb128
const EVP_CIPHER *EVP_aes_128_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_128_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_128_ccm(void);
+const EVP_CIPHER *EVP_aes_128_gcm(void);
+const EVP_CIPHER *EVP_aes_128_xts(void);
const EVP_CIPHER *EVP_aes_192_ecb(void);
const EVP_CIPHER *EVP_aes_192_cbc(void);
const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -751,9 +798,9 @@ const EVP_CIPHER *EVP_aes_192_cfb8(void);
const EVP_CIPHER *EVP_aes_192_cfb128(void);
# define EVP_aes_192_cfb EVP_aes_192_cfb128
const EVP_CIPHER *EVP_aes_192_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_192_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_192_ccm(void);
+const EVP_CIPHER *EVP_aes_192_gcm(void);
const EVP_CIPHER *EVP_aes_256_ecb(void);
const EVP_CIPHER *EVP_aes_256_cbc(void);
const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -761,8 +808,13 @@ const EVP_CIPHER *EVP_aes_256_cfb8(void);
const EVP_CIPHER *EVP_aes_256_cfb128(void);
# define EVP_aes_256_cfb EVP_aes_256_cfb128
const EVP_CIPHER *EVP_aes_256_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_256_ctr(void);
+const EVP_CIPHER *EVP_aes_256_ccm(void);
+const EVP_CIPHER *EVP_aes_256_gcm(void);
+const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
@@ -869,6 +921,7 @@ struct ec_key_st *EVP_PKEY_get1_EC_KEY(EVP_PKEY *pkey);
#endif
EVP_PKEY * EVP_PKEY_new(void);
+EVP_PKEY * EVP_PKEY_dup(EVP_PKEY *pkey);
void EVP_PKEY_free(EVP_PKEY *pkey);
EVP_PKEY * d2i_PublicKey(int type,EVP_PKEY **a, const unsigned char **pp,
@@ -1047,13 +1100,22 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
#define EVP_PKEY_CTRL_CMS_DECRYPT 10
#define EVP_PKEY_CTRL_CMS_SIGN 11
+#define EVP_PKEY_CTRL_CIPHER 12
+
#define EVP_PKEY_ALG_CTRL 0x1000
#define EVP_PKEY_FLAG_AUTOARGLEN 2
+/* Method handles all operations: don't assume any digest related
+ * defaults.
+ */
+#define EVP_PKEY_FLAG_SIGCTX_CUSTOM 4
const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags);
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+ const EVP_PKEY_METHOD *meth);
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
@@ -1071,7 +1133,7 @@ int EVP_PKEY_CTX_get_operation(EVP_PKEY_CTX *ctx);
void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen);
EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
- unsigned char *key, int keylen);
+ const unsigned char *key, int keylen);
void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data);
void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx);
@@ -1181,6 +1243,8 @@ void EVP_PKEY_meth_set_ctrl(EVP_PKEY_METHOD *pmeth,
int (*ctrl_str)(EVP_PKEY_CTX *ctx,
const char *type, const char *value));
+void EVP_add_alg_module(void);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -1190,8 +1254,14 @@ void ERR_load_EVP_strings(void);
/* Error codes for the EVP functions. */
/* Function codes. */
+#define EVP_F_AESNI_INIT_KEY 165
+#define EVP_F_AESNI_XTS_CIPHER 176
#define EVP_F_AES_INIT_KEY 133
+#define EVP_F_AES_XTS 172
+#define EVP_F_AES_XTS_CIPHER 175
+#define EVP_F_ALG_MODULE_INIT 177
#define EVP_F_CAMELLIA_INIT_KEY 159
+#define EVP_F_CMAC_INIT 173
#define EVP_F_D2I_PKEY 100
#define EVP_F_DO_SIGVER_INIT 161
#define EVP_F_DSAPKEY2PKCS8 134
@@ -1246,15 +1316,24 @@ void ERR_load_EVP_strings(void);
#define EVP_F_EVP_RIJNDAEL 126
#define EVP_F_EVP_SIGNFINAL 107
#define EVP_F_EVP_VERIFYFINAL 108
+#define EVP_F_FIPS_CIPHERINIT 166
+#define EVP_F_FIPS_CIPHER_CTX_COPY 170
+#define EVP_F_FIPS_CIPHER_CTX_CTRL 167
+#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH 171
+#define EVP_F_FIPS_DIGESTINIT 168
+#define EVP_F_FIPS_MD_CTX_COPY 169
+#define EVP_F_HMAC_INIT_EX 174
#define EVP_F_INT_CTX_NEW 157
#define EVP_F_PKCS5_PBE_KEYIVGEN 117
#define EVP_F_PKCS5_V2_PBE_KEYIVGEN 118
+#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN 164
#define EVP_F_PKCS8_SET_BROKEN 112
#define EVP_F_PKEY_SET_TYPE 158
#define EVP_F_RC2_MAGIC_TO_METH 109
#define EVP_F_RC5_CTRL 125
/* Reason codes. */
+#define EVP_R_AES_IV_SETUP_FAILED 162
#define EVP_R_AES_KEY_SETUP_FAILED 143
#define EVP_R_ASN1_LIB 140
#define EVP_R_BAD_BLOCK_LENGTH 136
@@ -1272,16 +1351,21 @@ void ERR_load_EVP_strings(void);
#define EVP_R_DECODE_ERROR 114
#define EVP_R_DIFFERENT_KEY_TYPES 101
#define EVP_R_DIFFERENT_PARAMETERS 153
+#define EVP_R_DISABLED_FOR_FIPS 163
#define EVP_R_ENCODE_ERROR 115
+#define EVP_R_ERROR_LOADING_SECTION 165
+#define EVP_R_ERROR_SETTING_FIPS_MODE 166
#define EVP_R_EVP_PBE_CIPHERINIT_ERROR 119
#define EVP_R_EXPECTING_AN_RSA_KEY 127
#define EVP_R_EXPECTING_A_DH_KEY 128
#define EVP_R_EXPECTING_A_DSA_KEY 129
#define EVP_R_EXPECTING_A_ECDSA_KEY 141
#define EVP_R_EXPECTING_A_EC_KEY 142
+#define EVP_R_FIPS_MODE_NOT_SUPPORTED 167
#define EVP_R_INITIALIZATION_ERROR 134
#define EVP_R_INPUT_NOT_INITIALIZED 111
#define EVP_R_INVALID_DIGEST 152
+#define EVP_R_INVALID_FIPS_MODE 168
#define EVP_R_INVALID_KEY_LENGTH 130
#define EVP_R_INVALID_OPERATION 148
#define EVP_R_IV_TOO_LARGE 102
@@ -1303,8 +1387,10 @@ void ERR_load_EVP_strings(void);
#define EVP_R_PRIVATE_KEY_DECODE_ERROR 145
#define EVP_R_PRIVATE_KEY_ENCODE_ERROR 146
#define EVP_R_PUBLIC_KEY_NOT_RSA 106
+#define EVP_R_TOO_LARGE 164
#define EVP_R_UNKNOWN_CIPHER 160
#define EVP_R_UNKNOWN_DIGEST 161
+#define EVP_R_UNKNOWN_OPTION 169
#define EVP_R_UNKNOWN_PBE_ALGORITHM 121
#define EVP_R_UNSUPORTED_NUMBER_OF_ROUNDS 135
#define EVP_R_UNSUPPORTED_ALGORITHM 156
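
The GCM ctrls and TLS length constants added above follow a two-stage init pattern: select the cipher first, adjust the IV length, then supply key and IV. A minimal encryption sketch under those assumptions (key, IV and AAD buffers are caller-supplied placeholders):

/* Sketch only: AES-128-GCM encryption with the ctrls declared above. */
#include <openssl/evp.h>

static int gcm_encrypt(const unsigned char key[16],
		       const unsigned char iv[12],
		       const unsigned char *aad, int aadlen,
		       const unsigned char *pt, int ptlen,
		       unsigned char *ct,
		       unsigned char tag[EVP_GCM_TLS_TAG_LEN])
	{
	int len, ok = 0;
	EVP_CIPHER_CTX ctx;
	EVP_CIPHER_CTX_init(&ctx);
	if (!EVP_EncryptInit_ex(&ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
		goto done;
	if (EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_SET_IVLEN, 12, NULL) <= 0)
		goto done;
	if (!EVP_EncryptInit_ex(&ctx, NULL, NULL, key, iv))
		goto done;
	/* AAD pass: output pointer is NULL */
	if (aadlen > 0 && !EVP_EncryptUpdate(&ctx, NULL, &len, aad, aadlen))
		goto done;
	if (!EVP_EncryptUpdate(&ctx, ct, &len, pt, ptlen))
		goto done;
	if (!EVP_EncryptFinal_ex(&ctx, ct + len, &len))
		goto done;
	if (EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG,
				EVP_GCM_TLS_TAG_LEN, tag) <= 0)
		goto done;
	ok = 1;
done:
	EVP_CIPHER_CTX_cleanup(&ctx);
	return ok;
	}
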
diff --git a/main/openssl/crypto/evp/evp_cnf.c b/main/openssl/crypto/evp/evp_cnf.c
new file mode 100644
index 00000000..2e4db302
--- /dev/null
+++ b/main/openssl/crypto/evp/evp_cnf.c
@@ -0,0 +1,125 @@
+/* evp_cnf.c */
+/* Written by Stephen Henson (steve@openssl.org) for the OpenSSL
+ * project 2007.
+ */
+/* ====================================================================
+ * Copyright (c) 2007 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/conf.h>
+#include <openssl/dso.h>
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+
+/* Algorithm configuration module. */
+
+static int alg_module_init(CONF_IMODULE *md, const CONF *cnf)
+ {
+ int i;
+ const char *oid_section;
+ STACK_OF(CONF_VALUE) *sktmp;
+ CONF_VALUE *oval;
+ oid_section = CONF_imodule_get_value(md);
+ if(!(sktmp = NCONF_get_section(cnf, oid_section)))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_ERROR_LOADING_SECTION);
+ return 0;
+ }
+ for(i = 0; i < sk_CONF_VALUE_num(sktmp); i++)
+ {
+ oval = sk_CONF_VALUE_value(sktmp, i);
+ if (!strcmp(oval->name, "fips_mode"))
+ {
+ int m;
+ if (!X509V3_get_value_bool(oval, &m))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_INVALID_FIPS_MODE);
+ return 0;
+ }
+ if (m > 0)
+ {
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode() && !FIPS_mode_set(1))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_ERROR_SETTING_FIPS_MODE);
+ return 0;
+ }
+#else
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_FIPS_MODE_NOT_SUPPORTED);
+ return 0;
+#endif
+ }
+ }
+ else
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_UNKNOWN_OPTION);
+ ERR_add_error_data(4, "name=", oval->name,
+ ", value=", oval->value);
+ }
+
+ }
+ return 1;
+ }
+
+void EVP_add_alg_module(void)
+ {
+ CONF_module_add("alg_section", alg_module_init, 0);
+ }
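
alg_module_init() above is registered under the name "alg_section" by EVP_add_alg_module(), so it only runs when a config file points at it. A hedged sketch of the wiring — the section and file names here are illustrative, not mandated by the patch:

/* Sketch: exercising the new alg_section config module.
 *
 * openssl.cnf fragment (illustrative):
 *     openssl_conf = openssl_init
 *     [openssl_init]
 *     alg_section = evp_settings
 *     [evp_settings]
 *     fips_mode = yes
 */
#include <openssl/conf.h>
#include <openssl/evp.h>

static int load_alg_config(const char *path)
	{
	EVP_add_alg_module();	/* registers the "alg_section" handler */
	/* Unknown names inside the section are reported through
	 * EVP_R_UNKNOWN_OPTION, as in alg_module_init() above. */
	return CONF_modules_load_file(path, NULL, 0);
	}
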
diff --git a/main/openssl/crypto/evp/evp_enc.c b/main/openssl/crypto/evp/evp_enc.c
index c268d25c..0c54f05e 100644
--- a/main/openssl/crypto/evp/evp_enc.c
+++ b/main/openssl/crypto/evp/evp_enc.c
@@ -64,8 +64,18 @@
#ifndef OPENSSL_NO_ENGINE
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include "evp_locl.h"
+#ifdef OPENSSL_FIPS
+#define M_do_cipher(ctx, out, in, inl) FIPS_cipher(ctx, out, in, inl)
+#else
+#define M_do_cipher(ctx, out, in, inl) ctx->cipher->do_cipher(ctx, out, in, inl)
+#endif
+
+
const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx)
@@ -115,10 +125,14 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
/* Ensure a context left lying around from last time is cleared
* (the previous check attempted to avoid this if the same
* ENGINE and EVP_CIPHER could be used). */
- EVP_CIPHER_CTX_cleanup(ctx);
-
- /* Restore encrypt field: it is zeroed by cleanup */
- ctx->encrypt = enc;
+ if (ctx->cipher)
+ {
+ unsigned long flags = ctx->flags;
+ EVP_CIPHER_CTX_cleanup(ctx);
+ /* Restore encrypt and flags */
+ ctx->encrypt = enc;
+ ctx->flags = flags;
+ }
#ifndef OPENSSL_NO_ENGINE
if(impl)
{
@@ -155,6 +169,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
ctx->engine = NULL;
#endif
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#endif
ctx->cipher=cipher;
if (ctx->cipher->ctx_size)
{
@@ -188,6 +206,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
#ifndef OPENSSL_NO_ENGINE
skip_to_init:
#endif
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#endif
/* we assume block size is a power of 2 in *cryptUpdate */
OPENSSL_assert(ctx->cipher->block_size == 1
|| ctx->cipher->block_size == 8
@@ -214,6 +236,13 @@ skip_to_init:
memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx));
break;
+ case EVP_CIPH_CTR_MODE:
+ ctx->num = 0;
+ /* Don't reuse IV for CTR mode */
+ if(iv)
+ memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx));
+ break;
+
default:
return 0;
break;
@@ -280,6 +309,16 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
{
int i,j,bl;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ i = M_do_cipher(ctx, out, in, inl);
+ if (i < 0)
+ return 0;
+ else
+ *outl = i;
+ return 1;
+ }
+
if (inl <= 0)
{
*outl = 0;
@@ -288,7 +327,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
if(ctx->buf_len == 0 && (inl&(ctx->block_mask)) == 0)
{
- if(ctx->cipher->do_cipher(ctx,out,in,inl))
+ if(M_do_cipher(ctx,out,in,inl))
{
*outl=inl;
return 1;
@@ -315,7 +354,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
{
j=bl-i;
memcpy(&(ctx->buf[i]),in,j);
- if(!ctx->cipher->do_cipher(ctx,out,ctx->buf,bl)) return 0;
+ if(!M_do_cipher(ctx,out,ctx->buf,bl)) return 0;
inl-=j;
in+=j;
out+=bl;
@@ -328,7 +367,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
inl-=i;
if (inl > 0)
{
- if(!ctx->cipher->do_cipher(ctx,out,in,inl)) return 0;
+ if(!M_do_cipher(ctx,out,in,inl)) return 0;
*outl+=inl;
}
@@ -350,6 +389,16 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
int n,ret;
unsigned int i, b, bl;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ ret = M_do_cipher(ctx, out, NULL, 0);
+ if (ret < 0)
+ return 0;
+ else
+ *outl = ret;
+ return 1;
+ }
+
b=ctx->cipher->block_size;
OPENSSL_assert(b <= sizeof ctx->buf);
if (b == 1)
@@ -372,7 +421,7 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
n=b-bl;
for (i=bl; i<b; i++)
ctx->buf[i]=n;
- ret=ctx->cipher->do_cipher(ctx,out,ctx->buf,b);
+ ret=M_do_cipher(ctx,out,ctx->buf,b);
if(ret)
@@ -387,6 +436,19 @@ int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
int fix_len;
unsigned int b;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ fix_len = M_do_cipher(ctx, out, in, inl);
+ if (fix_len < 0)
+ {
+ *outl = 0;
+ return 0;
+ }
+ else
+ *outl = fix_len;
+ return 1;
+ }
+
if (inl <= 0)
{
*outl = 0;
@@ -440,8 +502,18 @@ int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
{
int i,n;
unsigned int b;
-
*outl=0;
+
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ i = M_do_cipher(ctx, out, NULL, 0);
+ if (i < 0)
+ return 0;
+ else
+ *outl = i;
+ return 1;
+ }
+
b=ctx->cipher->block_size;
if (ctx->flags & EVP_CIPH_NO_PADDING)
{
@@ -496,6 +568,7 @@ void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx)
int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
{
+#ifndef OPENSSL_FIPS
if (c->cipher != NULL)
{
if(c->cipher->cleanup && !c->cipher->cleanup(c))
@@ -506,12 +579,16 @@ int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
}
if (c->cipher_data)
OPENSSL_free(c->cipher_data);
+#endif
#ifndef OPENSSL_NO_ENGINE
if (c->engine)
/* The EVP_CIPHER we used belongs to an ENGINE, release the
* functional reference we held for this reason. */
ENGINE_finish(c->engine);
#endif
+#ifdef OPENSSL_FIPS
+ FIPS_cipher_ctx_cleanup(c);
+#endif
memset(c,0,sizeof(EVP_CIPHER_CTX));
return 1;
}
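
The EVP_CIPH_FLAG_CUSTOM_CIPHER branches added above change the do_cipher() contract: the callback is handed the raw input (NULL input on the Final call), returns the number of bytes it wrote, and signals failure with a negative value. A skeleton honouring that contract, as a sketch only:

/* Sketch: do_cipher() shape for a cipher that sets
 * EVP_CIPH_FLAG_CUSTOM_CIPHER; the transform itself is elided. */
static int custom_do_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
			    const unsigned char *in, size_t inl)
	{
	if (in == NULL)
		{
		/* Final call from EVP_{En,De}cryptFinal_ex: append or
		 * verify the MAC, flush padding, etc. */
		return 0;	/* bytes emitted by finalisation */
		}
	/* ... process inl bytes from in into out ... */
	return (int)inl;	/* bytes written, or a negative value on error */
	}
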
diff --git a/main/openssl/crypto/evp/evp_err.c b/main/openssl/crypto/evp/evp_err.c
index d8bfec09..08eab988 100644
--- a/main/openssl/crypto/evp/evp_err.c
+++ b/main/openssl/crypto/evp/evp_err.c
@@ -1,6 +1,6 @@
/* crypto/evp/evp_err.c */
/* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,8 +70,14 @@
static ERR_STRING_DATA EVP_str_functs[]=
{
+{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"},
+{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"},
{ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"},
+{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
+{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
+{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
{ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"},
+{ERR_FUNC(EVP_F_CMAC_INIT), "CMAC_INIT"},
{ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"},
{ERR_FUNC(EVP_F_DO_SIGVER_INIT), "DO_SIGVER_INIT"},
{ERR_FUNC(EVP_F_DSAPKEY2PKCS8), "DSAPKEY2PKCS8"},
@@ -86,7 +92,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
{ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"},
{ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"},
{ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"},
-{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"},
+{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"},
{ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"},
{ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"},
{ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"},
@@ -126,9 +132,17 @@ static ERR_STRING_DATA EVP_str_functs[]=
{ERR_FUNC(EVP_F_EVP_RIJNDAEL), "EVP_RIJNDAEL"},
{ERR_FUNC(EVP_F_EVP_SIGNFINAL), "EVP_SignFinal"},
{ERR_FUNC(EVP_F_EVP_VERIFYFINAL), "EVP_VerifyFinal"},
+{ERR_FUNC(EVP_F_FIPS_CIPHERINIT), "FIPS_CIPHERINIT"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_COPY), "FIPS_CIPHER_CTX_COPY"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_CTRL), "FIPS_CIPHER_CTX_CTRL"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH), "FIPS_CIPHER_CTX_SET_KEY_LENGTH"},
+{ERR_FUNC(EVP_F_FIPS_DIGESTINIT), "FIPS_DIGESTINIT"},
+{ERR_FUNC(EVP_F_FIPS_MD_CTX_COPY), "FIPS_MD_CTX_COPY"},
+{ERR_FUNC(EVP_F_HMAC_INIT_EX), "HMAC_Init_ex"},
{ERR_FUNC(EVP_F_INT_CTX_NEW), "INT_CTX_NEW"},
{ERR_FUNC(EVP_F_PKCS5_PBE_KEYIVGEN), "PKCS5_PBE_keyivgen"},
{ERR_FUNC(EVP_F_PKCS5_V2_PBE_KEYIVGEN), "PKCS5_v2_PBE_keyivgen"},
+{ERR_FUNC(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN), "PKCS5_V2_PBKDF2_KEYIVGEN"},
{ERR_FUNC(EVP_F_PKCS8_SET_BROKEN), "PKCS8_set_broken"},
{ERR_FUNC(EVP_F_PKEY_SET_TYPE), "PKEY_SET_TYPE"},
{ERR_FUNC(EVP_F_RC2_MAGIC_TO_METH), "RC2_MAGIC_TO_METH"},
@@ -138,6 +152,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
static ERR_STRING_DATA EVP_str_reasons[]=
{
+{ERR_REASON(EVP_R_AES_IV_SETUP_FAILED) ,"aes iv setup failed"},
{ERR_REASON(EVP_R_AES_KEY_SETUP_FAILED) ,"aes key setup failed"},
{ERR_REASON(EVP_R_ASN1_LIB) ,"asn1 lib"},
{ERR_REASON(EVP_R_BAD_BLOCK_LENGTH) ,"bad block length"},
@@ -155,16 +170,21 @@ static ERR_STRING_DATA EVP_str_reasons[]=
{ERR_REASON(EVP_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES) ,"different key types"},
{ERR_REASON(EVP_R_DIFFERENT_PARAMETERS) ,"different parameters"},
+{ERR_REASON(EVP_R_DISABLED_FOR_FIPS) ,"disabled for fips"},
{ERR_REASON(EVP_R_ENCODE_ERROR) ,"encode error"},
+{ERR_REASON(EVP_R_ERROR_LOADING_SECTION) ,"error loading section"},
+{ERR_REASON(EVP_R_ERROR_SETTING_FIPS_MODE),"error setting fips mode"},
{ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
{ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY) ,"expecting an rsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_DH_KEY) ,"expecting a dh key"},
{ERR_REASON(EVP_R_EXPECTING_A_DSA_KEY) ,"expecting a dsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_ECDSA_KEY) ,"expecting a ecdsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_EC_KEY) ,"expecting a ec key"},
+{ERR_REASON(EVP_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(EVP_R_INITIALIZATION_ERROR) ,"initialization error"},
{ERR_REASON(EVP_R_INPUT_NOT_INITIALIZED) ,"input not initialized"},
{ERR_REASON(EVP_R_INVALID_DIGEST) ,"invalid digest"},
+{ERR_REASON(EVP_R_INVALID_FIPS_MODE) ,"invalid fips mode"},
{ERR_REASON(EVP_R_INVALID_KEY_LENGTH) ,"invalid key length"},
{ERR_REASON(EVP_R_INVALID_OPERATION) ,"invalid operation"},
{ERR_REASON(EVP_R_IV_TOO_LARGE) ,"iv too large"},
@@ -186,8 +206,10 @@ static ERR_STRING_DATA EVP_str_reasons[]=
{ERR_REASON(EVP_R_PRIVATE_KEY_DECODE_ERROR),"private key decode error"},
{ERR_REASON(EVP_R_PRIVATE_KEY_ENCODE_ERROR),"private key encode error"},
{ERR_REASON(EVP_R_PUBLIC_KEY_NOT_RSA) ,"public key not rsa"},
+{ERR_REASON(EVP_R_TOO_LARGE) ,"too large"},
{ERR_REASON(EVP_R_UNKNOWN_CIPHER) ,"unknown cipher"},
{ERR_REASON(EVP_R_UNKNOWN_DIGEST) ,"unknown digest"},
+{ERR_REASON(EVP_R_UNKNOWN_OPTION) ,"unknown option"},
{ERR_REASON(EVP_R_UNKNOWN_PBE_ALGORITHM) ,"unknown pbe algorithm"},
{ERR_REASON(EVP_R_UNSUPORTED_NUMBER_OF_ROUNDS),"unsuported number of rounds"},
{ERR_REASON(EVP_R_UNSUPPORTED_ALGORITHM) ,"unsupported algorithm"},
diff --git a/main/openssl/crypto/evp/evp_key.c b/main/openssl/crypto/evp/evp_key.c
index 839d6a3a..7961fbeb 100644
--- a/main/openssl/crypto/evp/evp_key.c
+++ b/main/openssl/crypto/evp/evp_key.c
@@ -120,7 +120,7 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
unsigned char md_buf[EVP_MAX_MD_SIZE];
int niv,nkey,addmd=0;
unsigned int mds=0,i;
-
+ int rv = 0;
nkey=type->key_len;
niv=type->iv_len;
OPENSSL_assert(nkey <= EVP_MAX_KEY_LENGTH);
@@ -134,17 +134,24 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
if (!EVP_DigestInit_ex(&c,md, NULL))
return 0;
if (addmd++)
- EVP_DigestUpdate(&c,&(md_buf[0]),mds);
- EVP_DigestUpdate(&c,data,datal);
+ if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+ goto err;
+ if (!EVP_DigestUpdate(&c,data,datal))
+ goto err;
if (salt != NULL)
- EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN);
- EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+ if (!EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN))
+ goto err;
+ if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+ goto err;
for (i=1; i<(unsigned int)count; i++)
{
- EVP_DigestInit_ex(&c,md, NULL);
- EVP_DigestUpdate(&c,&(md_buf[0]),mds);
- EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+ if (!EVP_DigestInit_ex(&c,md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+ goto err;
+ if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+ goto err;
}
i=0;
if (nkey)
@@ -173,8 +180,10 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
}
if ((nkey == 0) && (niv == 0)) break;
}
+ rv = type->key_len;
+ err:
EVP_MD_CTX_cleanup(&c);
OPENSSL_cleanse(&(md_buf[0]),EVP_MAX_MD_SIZE);
- return(type->key_len);
+ return rv;
}
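
Because EVP_BytesToKey() can now return 0 when a digest operation fails, callers should stop assuming the return value always equals the cipher's key length. A checked-use sketch (the cipher and digest choices are illustrative):

/* Sketch: EVP_BytesToKey with the return value actually checked. */
#include <openssl/evp.h>

static int derive_key_iv(const unsigned char *pass, int passlen,
			 const unsigned char salt[PKCS5_SALT_LEN],
			 unsigned char key[EVP_MAX_KEY_LENGTH],
			 unsigned char iv[EVP_MAX_IV_LENGTH])
	{
	int keylen = EVP_BytesToKey(EVP_aes_256_cbc(), EVP_sha1(), salt,
				    pass, passlen, 1, key, iv);
	return keylen == EVP_CIPHER_key_length(EVP_aes_256_cbc());
	}
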
diff --git a/main/openssl/crypto/evp/evp_lib.c b/main/openssl/crypto/evp/evp_lib.c
index 40951a04..b180e482 100644
--- a/main/openssl/crypto/evp/evp_lib.c
+++ b/main/openssl/crypto/evp/evp_lib.c
@@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->set_asn1_parameters != NULL)
ret=c->cipher->set_asn1_parameters(c,type);
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+ ret=EVP_CIPHER_set_asn1_iv(c, type);
else
ret=-1;
return(ret);
@@ -78,6 +80,8 @@ int EVP_CIPHER_asn1_to_param(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->get_asn1_parameters != NULL)
ret=c->cipher->get_asn1_parameters(c,type);
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+ ret=EVP_CIPHER_get_asn1_iv(c, type);
else
ret=-1;
return(ret);
diff --git a/main/openssl/crypto/evp/evp_locl.h b/main/openssl/crypto/evp/evp_locl.h
index 292d74c1..08c0a66d 100644
--- a/main/openssl/crypto/evp/evp_locl.h
+++ b/main/openssl/crypto/evp/evp_locl.h
@@ -343,3 +343,43 @@ struct evp_pkey_method_st
} /* EVP_PKEY_METHOD */;
void evp_pkey_set_cb_translate(BN_GENCB *cb, EVP_PKEY_CTX *ctx);
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+ ASN1_TYPE *param,
+ const EVP_CIPHER *c, const EVP_MD *md, int en_de);
+
+#ifdef OPENSSL_FIPS
+
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef SHA1_Init
+#undef SHA1_Update
+#undef SHA224_Init
+#undef SHA256_Init
+#undef SHA384_Init
+#undef SHA512_Init
+#undef DES_set_key_unchecked
+#endif
+
+#define RIPEMD160_Init private_RIPEMD160_Init
+#define WHIRLPOOL_Init private_WHIRLPOOL_Init
+#define MD5_Init private_MD5_Init
+#define MD4_Init private_MD4_Init
+#define MD2_Init private_MD2_Init
+#define MDC2_Init private_MDC2_Init
+#define SHA_Init private_SHA_Init
+#define SHA1_Init private_SHA1_Init
+#define SHA224_Init private_SHA224_Init
+#define SHA256_Init private_SHA256_Init
+#define SHA384_Init private_SHA384_Init
+#define SHA512_Init private_SHA512_Init
+
+#define BF_set_key private_BF_set_key
+#define CAST_set_key private_CAST_set_key
+#define idea_set_encrypt_key private_idea_set_encrypt_key
+#define SEED_set_key private_SEED_set_key
+#define RC2_set_key private_RC2_set_key
+#define RC4_set_key private_RC4_set_key
+#define DES_set_key_unchecked private_DES_set_key_unchecked
+#define Camellia_set_key private_Camellia_set_key
+
+#endif
diff --git a/main/openssl/crypto/evp/evp_pbe.c b/main/openssl/crypto/evp/evp_pbe.c
index c9d932d2..f8c32d82 100644
--- a/main/openssl/crypto/evp/evp_pbe.c
+++ b/main/openssl/crypto/evp/evp_pbe.c
@@ -61,6 +61,7 @@
#include <openssl/evp.h>
#include <openssl/pkcs12.h>
#include <openssl/x509.h>
+#include "evp_locl.h"
/* Password based encryption (PBE) functions */
@@ -87,6 +88,10 @@ static const EVP_PBE_CTL builtin_pbe[] =
{EVP_PBE_TYPE_OUTER, NID_pbeWithSHA1AndRC2_CBC,
NID_rc2_64_cbc, NID_sha1, PKCS5_PBE_keyivgen},
+#ifndef OPENSSL_NO_HMAC
+ {EVP_PBE_TYPE_OUTER, NID_id_pbkdf2, -1, -1, PKCS5_v2_PBKDF2_keyivgen},
+#endif
+
{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And128BitRC4,
NID_rc4, NID_sha1, PKCS12_PBE_keyivgen},
{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And40BitRC4,
diff --git a/main/openssl/crypto/evp/evptests.txt b/main/openssl/crypto/evp/evptests.txt
index beb12144..c273707c 100644
--- a/main/openssl/crypto/evp/evptests.txt
+++ b/main/openssl/crypto/evp/evptests.txt
@@ -158,6 +158,19 @@ AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:B7B
AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:E1C656305ED1A7A6563805746FE03EDC:30C81C46A35CE411E5FBC1191A0A52EF:71AB47A086E86EEDF39D1C5BBA97C408:0
AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:41635BE625B48AFC1666DD42A09D96E7:F69F2445DF4F9B17AD2B417BE66C3710:0126141D67F37BE8538F5A8BE740E484:0
+# AES Counter test vectors from RFC3686
+aes-128-ctr:AE6852F8121067CC4BF7A5765577F39E:00000030000000000000000000000001:53696E676C6520626C6F636B206D7367:E4095D4FB7A7B3792D6175A3261311B8:1
+aes-128-ctr:7E24067817FAE0D743D6CE1F32539163:006CB6DBC0543B59DA48D90B00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:5104A106168A72D9790D41EE8EDAD388EB2E1EFC46DA57C8FCE630DF9141BE28:1
+aes-128-ctr:7691BE035E5020A8AC6E618529F9A0DC:00E0017B27777F3F4A1786F000000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:C1CF48A89F2FFDD9CF4652E9EFDB72D74540A42BDE6D7836D59A5CEAAEF3105325B2072F:1
+
+aes-192-ctr:16AF5B145FC9F579C175F93E3BFB0EED863D06CCFDB78515:0000004836733C147D6D93CB00000001:53696E676C6520626C6F636B206D7367:4B55384FE259C9C84E7935A003CBE928:1
+aes-192-ctr:7C5CB2401B3DC33C19E7340819E0F69C678C3DB8E6F6A91A:0096B03B020C6EADC2CB500D00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:453243FC609B23327EDFAAFA7131CD9F8490701C5AD4A79CFC1FE0FF42F4FB00:1
+aes-192-ctr:02BF391EE8ECB159B959617B0965279BF59B60A786D3E0FE:0007BDFD5CBD60278DCC091200000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:96893FC55E5C722F540B7DD1DDF7E758D288BC95C69165884536C811662F2188ABEE0935:1
+
+aes-256-ctr:776BEFF2851DB06F4C8A0542C8696F6C6A81AF1EEC96B4D37FC1D689E6C1C104:00000060DB5672C97AA8F0B200000001:53696E676C6520626C6F636B206D7367:145AD01DBF824EC7560863DC71E3E0C0:1
+aes-256-ctr:F6D66D6BD52D59BB0796365879EFF886C66DD51A5B6A99744B50590C87A23884:00FAAC24C1585EF15A43D87500000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:F05E231B3894612C49EE000B804EB2A9B8306B508F839D6A5530831D9344AF1C:1
+aes-256-ctr:FF7A617CE69148E4F1726E2F43581DE2AA62D9F805532EDFF1EED687FB54153D:001CC5B751A51D70A1C1114800000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:EB6C52821D0BBBF7CE7594462ACA4FAAB407DF866569FD07F48CC0B583D6071F1EC0E6B8:1
+
# DES ECB tests (from destest)
DES-ECB:0000000000000000::0000000000000000:8CA64DE9C1B123A7
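
The new RFC 3686 vectors above can be replayed through the now-exported EVP_aes_128_ctr() (see the evp.h hunk). A self-contained sketch checking the first vector; the key, IV, plaintext and ciphertext bytes are copied from the vector line itself:

/* Sketch: verify the first aes-128-ctr vector with the EVP interface. */
#include <stdio.h>
#include <string.h>
#include <openssl/evp.h>

int main(void)
	{
	static const unsigned char key[16] = {
		0xAE,0x68,0x52,0xF8,0x12,0x10,0x67,0xCC,
		0x4B,0xF7,0xA5,0x76,0x55,0x77,0xF3,0x9E };
	static const unsigned char iv[16] = {
		0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x00,
		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01 };
	static const unsigned char pt[16] = "Single block msg";
	static const unsigned char ct[16] = {
		0xE4,0x09,0x5D,0x4F,0xB7,0xA7,0xB3,0x79,
		0x2D,0x61,0x75,0xA3,0x26,0x13,0x11,0xB8 };
	unsigned char out[16];
	int len, ok;
	EVP_CIPHER_CTX ctx;
	EVP_CIPHER_CTX_init(&ctx);
	ok = EVP_EncryptInit_ex(&ctx, EVP_aes_128_ctr(), NULL, key, iv)
	     && EVP_EncryptUpdate(&ctx, out, &len, pt, sizeof(pt))
	     && len == 16 && memcmp(out, ct, 16) == 0;
	EVP_CIPHER_CTX_cleanup(&ctx);
	printf("aes-128-ctr RFC3686 vector: %s\n", ok ? "ok" : "FAILED");
	return ok ? 0 : 1;
	}
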
diff --git a/main/openssl/crypto/evp/m_dss.c b/main/openssl/crypto/evp/m_dss.c
index 48c26895..6fb7e9a8 100644
--- a/main/openssl/crypto/evp/m_dss.c
+++ b/main/openssl/crypto/evp/m_dss.c
@@ -60,12 +60,13 @@
#include "cryptlib.h"
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_DSA
#include <openssl/dsa.h>
#endif
#ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -97,3 +98,4 @@ const EVP_MD *EVP_dss(void)
return(&dsa_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_dss1.c b/main/openssl/crypto/evp/m_dss1.c
index 4f03fb70..2df362a6 100644
--- a/main/openssl/crypto/evp/m_dss1.c
+++ b/main/openssl/crypto/evp/m_dss1.c
@@ -63,11 +63,13 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_DSA
#include <openssl/dsa.h>
#endif
+#ifndef OPENSSL_FIPS
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -98,3 +100,4 @@ const EVP_MD *EVP_dss1(void)
return(&dss1_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_ecdsa.c b/main/openssl/crypto/evp/m_ecdsa.c
index 8d87a49e..4b15fb0f 100644
--- a/main/openssl/crypto/evp/m_ecdsa.c
+++ b/main/openssl/crypto/evp/m_ecdsa.c
@@ -116,6 +116,8 @@
#include <openssl/x509.h>
#ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void)
return(&ecdsa_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_md4.c b/main/openssl/crypto/evp/m_md4.c
index 1e0b7c5b..6d47f61b 100644
--- a/main/openssl/crypto/evp/m_md4.c
+++ b/main/openssl/crypto/evp/m_md4.c
@@ -69,6 +69,8 @@
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
+
static int init(EVP_MD_CTX *ctx)
{ return MD4_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_md5.c b/main/openssl/crypto/evp/m_md5.c
index 63c14211..9a8bae02 100644
--- a/main/openssl/crypto/evp/m_md5.c
+++ b/main/openssl/crypto/evp/m_md5.c
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return MD5_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_mdc2.c b/main/openssl/crypto/evp/m_mdc2.c
index b08d5598..3602bed3 100644
--- a/main/openssl/crypto/evp/m_mdc2.c
+++ b/main/openssl/crypto/evp/m_mdc2.c
@@ -69,6 +69,8 @@
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
+
static int init(EVP_MD_CTX *ctx)
{ return MDC2_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_ripemd.c b/main/openssl/crypto/evp/m_ripemd.c
index a1d60ee7..7bf4804c 100644
--- a/main/openssl/crypto/evp/m_ripemd.c
+++ b/main/openssl/crypto/evp/m_ripemd.c
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return RIPEMD160_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_sha1.c b/main/openssl/crypto/evp/m_sha1.c
index 9a2790fd..bd0c01ad 100644
--- a/main/openssl/crypto/evp/m_sha1.c
+++ b/main/openssl/crypto/evp/m_sha1.c
@@ -59,15 +59,18 @@
#include <stdio.h>
#include "cryptlib.h"
+#ifndef OPENSSL_FIPS
+
#ifndef OPENSSL_NO_SHA
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -202,3 +205,5 @@ static const EVP_MD sha512_md=
const EVP_MD *EVP_sha512(void)
{ return(&sha512_md); }
#endif /* ifndef OPENSSL_NO_SHA512 */
+
+#endif
diff --git a/main/openssl/crypto/evp/m_wp.c b/main/openssl/crypto/evp/m_wp.c
index 1ce47c04..c51bc2d5 100644
--- a/main/openssl/crypto/evp/m_wp.c
+++ b/main/openssl/crypto/evp/m_wp.c
@@ -9,6 +9,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/whrlpool.h>
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return WHIRLPOOL_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/names.c b/main/openssl/crypto/evp/names.c
index f2869f5c..6311ad7c 100644
--- a/main/openssl/crypto/evp/names.c
+++ b/main/openssl/crypto/evp/names.c
@@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c)
{
int r;
+ if (c == NULL) return 0;
+
+ OPENSSL_init();
+
r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
if (r == 0) return(0);
check_defer(c->nid);
@@ -78,6 +82,7 @@ int EVP_add_digest(const EVP_MD *md)
{
int r;
const char *name;
+ OPENSSL_init();
name=OBJ_nid2sn(md->type);
r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);
diff --git a/main/openssl/crypto/evp/p5_crpt.c b/main/openssl/crypto/evp/p5_crpt.c
index 7ecfa8da..294cc90d 100644
--- a/main/openssl/crypto/evp/p5_crpt.c
+++ b/main/openssl/crypto/evp/p5_crpt.c
@@ -82,6 +82,8 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
unsigned char *salt;
const unsigned char *pbuf;
int mdsize;
+ int rv = 0;
+ EVP_MD_CTX_init(&ctx);
/* Extract useful info from parameter */
if (param == NULL || param->type != V_ASN1_SEQUENCE ||
@@ -104,29 +106,38 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
if(!pass) passlen = 0;
else if(passlen == -1) passlen = strlen(pass);
- EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, md, NULL);
- EVP_DigestUpdate(&ctx, pass, passlen);
- EVP_DigestUpdate(&ctx, salt, saltlen);
+ if (!EVP_DigestInit_ex(&ctx, md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, pass, passlen))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, salt, saltlen))
+ goto err;
PBEPARAM_free(pbe);
- EVP_DigestFinal_ex(&ctx, md_tmp, NULL);
+ if (!EVP_DigestFinal_ex(&ctx, md_tmp, NULL))
+ goto err;
mdsize = EVP_MD_size(md);
if (mdsize < 0)
return 0;
for (i = 1; i < iter; i++) {
- EVP_DigestInit_ex(&ctx, md, NULL);
- EVP_DigestUpdate(&ctx, md_tmp, mdsize);
- EVP_DigestFinal_ex (&ctx, md_tmp, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, md_tmp, mdsize))
+ goto err;
+ if (!EVP_DigestFinal_ex (&ctx, md_tmp, NULL))
+ goto err;
}
- EVP_MD_CTX_cleanup(&ctx);
OPENSSL_assert(EVP_CIPHER_key_length(cipher) <= (int)sizeof(md_tmp));
memcpy(key, md_tmp, EVP_CIPHER_key_length(cipher));
OPENSSL_assert(EVP_CIPHER_iv_length(cipher) <= 16);
memcpy(iv, md_tmp + (16 - EVP_CIPHER_iv_length(cipher)),
EVP_CIPHER_iv_length(cipher));
- EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de);
+ if (!EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de))
+ goto err;
OPENSSL_cleanse(md_tmp, EVP_MAX_MD_SIZE);
OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH);
- return 1;
+ rv = 1;
+ err:
+ EVP_MD_CTX_cleanup(&ctx);
+ return rv;
}
diff --git a/main/openssl/crypto/evp/p5_crpt2.c b/main/openssl/crypto/evp/p5_crpt2.c
index 334379f3..fe3c6c88 100644
--- a/main/openssl/crypto/evp/p5_crpt2.c
+++ b/main/openssl/crypto/evp/p5_crpt2.c
@@ -62,6 +62,7 @@
#include <openssl/x509.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
+#include "evp_locl.h"
/* set this to print out info about the keygen algorithm */
/* #define DEBUG_PKCS5V2 */
@@ -84,19 +85,24 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
unsigned char digtmp[EVP_MAX_MD_SIZE], *p, itmp[4];
int cplen, j, k, tkeylen, mdlen;
unsigned long i = 1;
- HMAC_CTX hctx;
+ HMAC_CTX hctx_tpl, hctx;
mdlen = EVP_MD_size(digest);
if (mdlen < 0)
return 0;
- HMAC_CTX_init(&hctx);
+ HMAC_CTX_init(&hctx_tpl);
p = out;
tkeylen = keylen;
if(!pass)
passlen = 0;
else if(passlen == -1)
passlen = strlen(pass);
+ if (!HMAC_Init_ex(&hctx_tpl, pass, passlen, digest, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
while(tkeylen)
{
if(tkeylen > mdlen)
@@ -110,15 +116,36 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
itmp[1] = (unsigned char)((i >> 16) & 0xff);
itmp[2] = (unsigned char)((i >> 8) & 0xff);
itmp[3] = (unsigned char)(i & 0xff);
- HMAC_Init_ex(&hctx, pass, passlen, digest, NULL);
- HMAC_Update(&hctx, salt, saltlen);
- HMAC_Update(&hctx, itmp, 4);
- HMAC_Final(&hctx, digtmp, NULL);
+ if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
+ if (!HMAC_Update(&hctx, salt, saltlen)
+ || !HMAC_Update(&hctx, itmp, 4)
+ || !HMAC_Final(&hctx, digtmp, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ HMAC_CTX_cleanup(&hctx);
+ return 0;
+ }
+ HMAC_CTX_cleanup(&hctx);
memcpy(p, digtmp, cplen);
for(j = 1; j < iter; j++)
{
- HMAC(digest, pass, passlen,
- digtmp, mdlen, digtmp, NULL);
+ if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
+ if (!HMAC_Update(&hctx, digtmp, mdlen)
+ || !HMAC_Final(&hctx, digtmp, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ HMAC_CTX_cleanup(&hctx);
+ return 0;
+ }
+ HMAC_CTX_cleanup(&hctx);
for(k = 0; k < cplen; k++)
p[k] ^= digtmp[k];
}
@@ -126,7 +153,7 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
i++;
p+= cplen;
}
- HMAC_CTX_cleanup(&hctx);
+ HMAC_CTX_cleanup(&hctx_tpl);
#ifdef DEBUG_PKCS5V2
fprintf(stderr, "Password:\n");
h__dump (pass, passlen);
@@ -168,27 +195,24 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
ASN1_TYPE *param, const EVP_CIPHER *c, const EVP_MD *md,
int en_de)
{
- unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
const unsigned char *pbuf;
- int saltlen, iter, plen;
- unsigned int keylen;
+ int plen;
PBE2PARAM *pbe2 = NULL;
const EVP_CIPHER *cipher;
- PBKDF2PARAM *kdf = NULL;
- const EVP_MD *prfmd;
- int prf_nid, hmac_md_nid;
+
+ int rv = 0;
if (param == NULL || param->type != V_ASN1_SEQUENCE ||
param->value.sequence == NULL) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
- return 0;
+ goto err;
}
pbuf = param->value.sequence->data;
plen = param->value.sequence->length;
if(!(pbe2 = d2i_PBE2PARAM(NULL, &pbuf, plen))) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
- return 0;
+ goto err;
}
/* See if we recognise the key derivation function */
@@ -211,38 +235,63 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
}
/* Fixup cipher based on AlgorithmIdentifier */
- EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de);
+ if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de))
+ goto err;
if(EVP_CIPHER_asn1_to_param(ctx, pbe2->encryption->parameter) < 0) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
EVP_R_CIPHER_PARAMETER_ERROR);
goto err;
}
+ rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass, passlen,
+ pbe2->keyfunc->parameter, c, md, en_de);
+ err:
+ PBE2PARAM_free(pbe2);
+ return rv;
+}
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+ ASN1_TYPE *param,
+ const EVP_CIPHER *c, const EVP_MD *md, int en_de)
+{
+ unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
+ const unsigned char *pbuf;
+ int saltlen, iter, plen;
+ int rv = 0;
+ unsigned int keylen = 0;
+ int prf_nid, hmac_md_nid;
+ PBKDF2PARAM *kdf = NULL;
+ const EVP_MD *prfmd;
+
+ if (EVP_CIPHER_CTX_cipher(ctx) == NULL)
+ {
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_NO_CIPHER_SET);
+ goto err;
+ }
keylen = EVP_CIPHER_CTX_key_length(ctx);
OPENSSL_assert(keylen <= sizeof key);
- /* Now decode key derivation function */
+ /* Decode parameter */
- if(!pbe2->keyfunc->parameter ||
- (pbe2->keyfunc->parameter->type != V_ASN1_SEQUENCE))
+ if(!param || (param->type != V_ASN1_SEQUENCE))
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
goto err;
}
- pbuf = pbe2->keyfunc->parameter->value.sequence->data;
- plen = pbe2->keyfunc->parameter->value.sequence->length;
+ pbuf = param->value.sequence->data;
+ plen = param->value.sequence->length;
+
if(!(kdf = d2i_PBKDF2PARAM(NULL, &pbuf, plen)) ) {
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
goto err;
}
- PBE2PARAM_free(pbe2);
- pbe2 = NULL;
+ keylen = EVP_CIPHER_CTX_key_length(ctx);
/* Now check the parameters of the kdf */
if(kdf->keylength && (ASN1_INTEGER_get(kdf->keylength) != (int)keylen)){
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
EVP_R_UNSUPPORTED_KEYLENGTH);
goto err;
}
@@ -254,19 +303,19 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, prf_nid, NULL, &hmac_md_nid, 0))
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
goto err;
}
prfmd = EVP_get_digestbynid(hmac_md_nid);
if (prfmd == NULL)
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
goto err;
}
if(kdf->salt->type != V_ASN1_OCTET_STRING) {
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
EVP_R_UNSUPPORTED_SALT_TYPE);
goto err;
}
@@ -278,15 +327,11 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
if(!PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter, prfmd,
keylen, key))
goto err;
- EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
- OPENSSL_cleanse(key, keylen);
- PBKDF2PARAM_free(kdf);
- return 1;
-
+ rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
err:
- PBE2PARAM_free(pbe2);
+ OPENSSL_cleanse(key, keylen);
PBKDF2PARAM_free(kdf);
- return 0;
+ return rv;
}
#ifdef DEBUG_PKCS5V2
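
The rewritten PKCS5_PBKDF2_HMAC() above keys an HMAC template once and clones it per block, instead of re-running the key schedule on every iteration. Applications that just need a PBKDF2 key can call it directly; a sketch with illustrative parameters (iteration count, PRF and key size are the caller's choice):

/* Sketch: direct PBKDF2 derivation via PKCS5_PBKDF2_HMAC. */
#include <openssl/evp.h>

static int pbkdf2_key(const char *pass,
		      const unsigned char *salt, int saltlen,
		      unsigned char key[32])
	{
	/* passlen of -1 means "use strlen(pass)", as in the code above. */
	return PKCS5_PBKDF2_HMAC(pass, -1, salt, saltlen, 10000,
				 EVP_sha256(), 32, key);
	}
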
diff --git a/main/openssl/crypto/evp/p_lib.c b/main/openssl/crypto/evp/p_lib.c
index e26ccd0d..bd1977d7 100644
--- a/main/openssl/crypto/evp/p_lib.c
+++ b/main/openssl/crypto/evp/p_lib.c
@@ -200,6 +200,12 @@ EVP_PKEY *EVP_PKEY_new(void)
return(ret);
}
+EVP_PKEY *EVP_PKEY_dup(EVP_PKEY *pkey)
+ {
+ CRYPTO_add(&pkey->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ return pkey;
+ }
+
/* Setup a public key ASN1 method and ENGINE from a NID or a string.
* If pkey is NULL just return 1 or 0 if the algorithm exists.
*/
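
Note that the EVP_PKEY_dup() added above is a reference-count bump, not a deep copy: both pointers name the same object, and each reference must eventually be released with EVP_PKEY_free(). A short sketch of the ownership pattern:

/* Sketch: EVP_PKEY_dup aliases the key; free each reference once. */
static EVP_PKEY *share_key(EVP_PKEY *pkey)
	{
	EVP_PKEY *ref = EVP_PKEY_dup(pkey);	/* refcount + 1 */
	return ref;	/* the new owner calls EVP_PKEY_free(ref) */
	}
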
diff --git a/main/openssl/crypto/evp/p_open.c b/main/openssl/crypto/evp/p_open.c
index 53a59a29..c748fbea 100644
--- a/main/openssl/crypto/evp/p_open.c
+++ b/main/openssl/crypto/evp/p_open.c
@@ -115,7 +115,8 @@ int EVP_OpenFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
int i;
i=EVP_DecryptFinal_ex(ctx,out,outl);
- EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+ if (i)
+ i = EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
return(i);
}
#else /* !OPENSSL_NO_RSA */
diff --git a/main/openssl/crypto/evp/p_seal.c b/main/openssl/crypto/evp/p_seal.c
index d8324526..e5919b0f 100644
--- a/main/openssl/crypto/evp/p_seal.c
+++ b/main/openssl/crypto/evp/p_seal.c
@@ -110,6 +110,7 @@ int EVP_SealFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
{
int i;
i = EVP_EncryptFinal_ex(ctx,out,outl);
- EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+ if (i)
+ i = EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
return i;
}
diff --git a/main/openssl/crypto/evp/p_sign.c b/main/openssl/crypto/evp/p_sign.c
index bb893f5b..8afb6643 100644
--- a/main/openssl/crypto/evp/p_sign.c
+++ b/main/openssl/crypto/evp/p_sign.c
@@ -80,18 +80,20 @@ int EVP_SignFinal(EVP_MD_CTX *ctx, unsigned char *sigret, unsigned int *siglen,
{
unsigned char m[EVP_MAX_MD_SIZE];
unsigned int m_len;
- int i,ok=0,v;
+ int i = 0,ok = 0,v;
EVP_MD_CTX tmp_ctx;
+ EVP_PKEY_CTX *pkctx = NULL;
*siglen=0;
EVP_MD_CTX_init(&tmp_ctx);
- EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);
- EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+ if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+ goto err;
+ if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+ goto err;
EVP_MD_CTX_cleanup(&tmp_ctx);
if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- EVP_PKEY_CTX *pkctx = NULL;
size_t sltmp = (size_t)EVP_PKEY_size(pkey);
i = 0;
pkctx = EVP_PKEY_CTX_new(pkey, NULL);
diff --git a/main/openssl/crypto/evp/p_verify.c b/main/openssl/crypto/evp/p_verify.c
index 41d4b671..c66d63cc 100644
--- a/main/openssl/crypto/evp/p_verify.c
+++ b/main/openssl/crypto/evp/p_verify.c
@@ -67,17 +67,19 @@ int EVP_VerifyFinal(EVP_MD_CTX *ctx, const unsigned char *sigbuf,
{
unsigned char m[EVP_MAX_MD_SIZE];
unsigned int m_len;
- int i,ok=0,v;
+ int i = 0,ok = 0,v;
EVP_MD_CTX tmp_ctx;
+ EVP_PKEY_CTX *pkctx = NULL;
EVP_MD_CTX_init(&tmp_ctx);
- EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);
- EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+ if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+ goto err;
+ if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+ goto err;
EVP_MD_CTX_cleanup(&tmp_ctx);
if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- EVP_PKEY_CTX *pkctx = NULL;
i = -1;
pkctx = EVP_PKEY_CTX_new(pkey, NULL);
if (!pkctx)
diff --git a/main/openssl/crypto/evp/pmeth_gn.c b/main/openssl/crypto/evp/pmeth_gn.c
index 5d74161a..4651c813 100644
--- a/main/openssl/crypto/evp/pmeth_gn.c
+++ b/main/openssl/crypto/evp/pmeth_gn.c
@@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx)
}
EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
- unsigned char *key, int keylen)
+ const unsigned char *key, int keylen)
{
EVP_PKEY_CTX *mac_ctx = NULL;
EVP_PKEY *mac_key = NULL;
@@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
if (EVP_PKEY_keygen_init(mac_ctx) <= 0)
goto merr;
if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN,
- EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0)
+ EVP_PKEY_CTRL_SET_MAC_KEY,
+ keylen, (void *)key) <= 0)
goto merr;
if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0)
goto merr;
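
With the key argument now const-qualified, EVP_PKEY_new_mac_key() can take read-only key material directly. A sketch of the common pairing with the EVP_DigestSign interface for HMAC (the digest choice is illustrative):

/* Sketch: HMAC-SHA256 through EVP_PKEY_new_mac_key + EVP_DigestSign. */
#include <openssl/evp.h>

static int hmac_sha256(const unsigned char *key, int keylen,
		       const unsigned char *msg, size_t msglen,
		       unsigned char *mac, size_t *maclen)
	{
	int ok = 0;
	EVP_MD_CTX ctx;
	EVP_PKEY *pkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL,
					      key, keylen);
	EVP_MD_CTX_init(&ctx);
	if (pkey != NULL
	    && EVP_DigestSignInit(&ctx, NULL, EVP_sha256(), NULL, pkey) > 0
	    && EVP_DigestSignUpdate(&ctx, msg, msglen) > 0
	    && EVP_DigestSignFinal(&ctx, mac, maclen) > 0)
		ok = 1;
	EVP_MD_CTX_cleanup(&ctx);
	EVP_PKEY_free(pkey);
	return ok;
	}
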
diff --git a/main/openssl/crypto/evp/pmeth_lib.c b/main/openssl/crypto/evp/pmeth_lib.c
index 5481d4b8..acfa7b6f 100644
--- a/main/openssl/crypto/evp/pmeth_lib.c
+++ b/main/openssl/crypto/evp/pmeth_lib.c
@@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD)
STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL;
extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth;
-extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth;
+extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth;
static const EVP_PKEY_METHOD *standard_methods[] =
{
@@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] =
&ec_pkey_meth,
#endif
&hmac_pkey_meth,
+ &cmac_pkey_meth
};
DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *,
@@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
if (!pmeth)
return NULL;
+ memset(pmeth, 0, sizeof(EVP_PKEY_METHOD));
+
pmeth->pkey_id = id;
pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC;
@@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
return pmeth;
}
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+ const EVP_PKEY_METHOD *meth)
+ {
+ if (ppkey_id)
+ *ppkey_id = meth->pkey_id;
+ if (pflags)
+ *pflags = meth->flags;
+ }
+
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src)
+ {
+
+ dst->init = src->init;
+ dst->copy = src->copy;
+ dst->cleanup = src->cleanup;
+
+ dst->paramgen_init = src->paramgen_init;
+ dst->paramgen = src->paramgen;
+
+ dst->keygen_init = src->keygen_init;
+ dst->keygen = src->keygen;
+
+ dst->sign_init = src->sign_init;
+ dst->sign = src->sign;
+
+ dst->verify_init = src->verify_init;
+ dst->verify = src->verify;
+
+ dst->verify_recover_init = src->verify_recover_init;
+ dst->verify_recover = src->verify_recover;
+
+ dst->signctx_init = src->signctx_init;
+ dst->signctx = src->signctx;
+
+ dst->verifyctx_init = src->verifyctx_init;
+ dst->verifyctx = src->verifyctx;
+
+ dst->encrypt_init = src->encrypt_init;
+ dst->encrypt = src->encrypt;
+
+ dst->decrypt_init = src->decrypt_init;
+ dst->decrypt = src->decrypt;
+
+ dst->derive_init = src->derive_init;
+ dst->derive = src->derive;
+
+ dst->ctrl = src->ctrl;
+ dst->ctrl_str = src->ctrl_str;
+ }
+
void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth)
{
if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC))
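
The new EVP_PKEY_meth_copy() makes it straightforward to start a custom method from a built-in one and override only selected callbacks. A hedged sketch (error handling abbreviated; the overridden callback is left as a comment):

/* Sketch: clone the built-in RSA method, then patch callbacks. */
#include <openssl/evp.h>

static EVP_PKEY_METHOD *clone_rsa_meth(void)
	{
	const EVP_PKEY_METHOD *builtin = EVP_PKEY_meth_find(EVP_PKEY_RSA);
	EVP_PKEY_METHOD *mine = EVP_PKEY_meth_new(EVP_PKEY_RSA, 0);
	if (builtin == NULL || mine == NULL)
		return NULL;
	EVP_PKEY_meth_copy(mine, builtin);
	/* ... EVP_PKEY_meth_set_sign(mine, my_sign_init, my_sign); ... */
	return mine;	/* register with EVP_PKEY_meth_add0() */
	}
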
diff --git a/main/openssl/crypto/hmac/hm_ameth.c b/main/openssl/crypto/hmac/hm_ameth.c
index 6d8a8914..e03f24ae 100644
--- a/main/openssl/crypto/hmac/hm_ameth.c
+++ b/main/openssl/crypto/hmac/hm_ameth.c
@@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth =
hmac_size,
0,
- 0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,
hmac_key_free,
hmac_pkey_ctrl,
diff --git a/main/openssl/crypto/hmac/hm_pmeth.c b/main/openssl/crypto/hmac/hm_pmeth.c
index 71e8567a..0daa4451 100644
--- a/main/openssl/crypto/hmac/hm_pmeth.c
+++ b/main/openssl/crypto/hmac/hm_pmeth.c
@@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
dctx = dst->data;
dctx->md = sctx->md;
HMAC_CTX_init(&dctx->ctx);
- HMAC_CTX_copy(&dctx->ctx, &sctx->ctx);
+ if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx))
+ return 0;
if (sctx->ktmp.data)
{
if (!ASN1_OCTET_STRING_set(&dctx->ktmp,
@@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
{
HMAC_PKEY_CTX *hctx = ctx->pctx->data;
- HMAC_Update(&hctx->ctx, data, count);
+ if (!HMAC_Update(&hctx->ctx, data, count))
+ return 0;
return 1;
}
@@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
if (!sig)
return 1;
- HMAC_Final(&hctx->ctx, sig, &hlen);
+ if (!HMAC_Final(&hctx->ctx, sig, &hlen))
+ return 0;
*siglen = (size_t)hlen;
return 1;
}
@@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_DIGESTINIT:
key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr;
- HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
- ctx->engine);
+ if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
+ ctx->engine))
+ return 0;
break;
default:
diff --git a/main/openssl/crypto/hmac/hmac.c b/main/openssl/crypto/hmac/hmac.c
index 6c98fc43..ba27cbf5 100644
--- a/main/openssl/crypto/hmac/hmac.c
+++ b/main/openssl/crypto/hmac/hmac.c
@@ -61,12 +61,34 @@
#include "cryptlib.h"
#include <openssl/hmac.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len,
const EVP_MD *md, ENGINE *impl)
{
int i,j,reset=0;
unsigned char pad[HMAC_MAX_MD_CBLOCK];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ /* If we have an ENGINE need to allow non FIPS */
+ if ((impl || ctx->i_ctx.engine)
+ && !(ctx->i_ctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+ {
+ EVPerr(EVP_F_HMAC_INIT_EX, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_hmac_init_ex().
+ */
+ if (!impl && !ctx->i_ctx.engine)
+ return FIPS_hmac_init_ex(ctx, key, len, md, NULL);
+ }
+#endif
+
if (md != NULL)
{
reset=1;
@@ -133,6 +155,10 @@ int HMAC_Init(HMAC_CTX *ctx, const void *key, int len, const EVP_MD *md)
int HMAC_Update(HMAC_CTX *ctx, const unsigned char *data, size_t len)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ return FIPS_hmac_update(ctx, data, len);
+#endif
return EVP_DigestUpdate(&ctx->md_ctx,data,len);
}
@@ -140,6 +166,10 @@ int HMAC_Final(HMAC_CTX *ctx, unsigned char *md, unsigned int *len)
{
unsigned int i;
unsigned char buf[EVP_MAX_MD_SIZE];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ return FIPS_hmac_final(ctx, md, len);
+#endif
if (!EVP_DigestFinal_ex(&ctx->md_ctx,buf,&i))
goto err;
@@ -179,6 +209,13 @@ int HMAC_CTX_copy(HMAC_CTX *dctx, HMAC_CTX *sctx)
void HMAC_CTX_cleanup(HMAC_CTX *ctx)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ {
+ FIPS_hmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
EVP_MD_CTX_cleanup(&ctx->i_ctx);
EVP_MD_CTX_cleanup(&ctx->o_ctx);
EVP_MD_CTX_cleanup(&ctx->md_ctx);
diff --git a/main/openssl/crypto/ia64cpuid.S b/main/openssl/crypto/ia64cpuid.S
index d705fff7..7832b9b6 100644
--- a/main/openssl/crypto/ia64cpuid.S
+++ b/main/openssl/crypto/ia64cpuid.S
@@ -26,7 +26,7 @@ OPENSSL_atomic_add:
{ .mii; mov ar.ccv=r2
add r8=r2,r33
mov r3=r2 };;
-{ .mmi; mf
+{ .mmi; mf;;
cmpxchg4.acq r2=[r32],r8,ar.ccv
nop.i 0 };;
{ .mib; cmp.ne p6,p0=r2,r3
diff --git a/main/openssl/crypto/md4/md4.h b/main/openssl/crypto/md4/md4.h
index c3ed9b3f..a55368a7 100644
--- a/main/openssl/crypto/md4/md4.h
+++ b/main/openssl/crypto/md4/md4.h
@@ -105,6 +105,9 @@ typedef struct MD4state_st
unsigned int num;
} MD4_CTX;
+#ifdef OPENSSL_FIPS
+int private_MD4_Init(MD4_CTX *c);
+#endif
int MD4_Init(MD4_CTX *c);
int MD4_Update(MD4_CTX *c, const void *data, size_t len);
int MD4_Final(unsigned char *md, MD4_CTX *c);
diff --git a/main/openssl/crypto/md4/md4_dgst.c b/main/openssl/crypto/md4/md4_dgst.c
index e0c42e85..b5b165b0 100644
--- a/main/openssl/crypto/md4/md4_dgst.c
+++ b/main/openssl/crypto/md4/md4_dgst.c
@@ -57,8 +57,9 @@
*/
#include <stdio.h>
-#include "md4_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include "md4_locl.h"
const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
@@ -70,7 +71,7 @@ const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
#define INIT_DATA_C (unsigned long)0x98badcfeL
#define INIT_DATA_D (unsigned long)0x10325476L
-int MD4_Init(MD4_CTX *c)
+fips_md_init(MD4)
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
@@ -105,22 +106,23 @@ void md4_block_data_order (MD4_CTX *c, const void *data_, size_t num)
for (;num--;)
{
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
+ (void)HOST_c2l(data,l); X( 0)=l;
+ (void)HOST_c2l(data,l); X( 1)=l;
/* Round 0 */
- R0(A,B,C,D,X( 0), 3,0); HOST_c2l(data,l); X( 2)=l;
- R0(D,A,B,C,X( 1), 7,0); HOST_c2l(data,l); X( 3)=l;
- R0(C,D,A,B,X( 2),11,0); HOST_c2l(data,l); X( 4)=l;
- R0(B,C,D,A,X( 3),19,0); HOST_c2l(data,l); X( 5)=l;
- R0(A,B,C,D,X( 4), 3,0); HOST_c2l(data,l); X( 6)=l;
- R0(D,A,B,C,X( 5), 7,0); HOST_c2l(data,l); X( 7)=l;
- R0(C,D,A,B,X( 6),11,0); HOST_c2l(data,l); X( 8)=l;
- R0(B,C,D,A,X( 7),19,0); HOST_c2l(data,l); X( 9)=l;
- R0(A,B,C,D,X( 8), 3,0); HOST_c2l(data,l); X(10)=l;
- R0(D,A,B,C,X( 9), 7,0); HOST_c2l(data,l); X(11)=l;
- R0(C,D,A,B,X(10),11,0); HOST_c2l(data,l); X(12)=l;
- R0(B,C,D,A,X(11),19,0); HOST_c2l(data,l); X(13)=l;
- R0(A,B,C,D,X(12), 3,0); HOST_c2l(data,l); X(14)=l;
- R0(D,A,B,C,X(13), 7,0); HOST_c2l(data,l); X(15)=l;
+ R0(A,B,C,D,X( 0), 3,0); (void)HOST_c2l(data,l); X( 2)=l;
+ R0(D,A,B,C,X( 1), 7,0); (void)HOST_c2l(data,l); X( 3)=l;
+ R0(C,D,A,B,X( 2),11,0); (void)HOST_c2l(data,l); X( 4)=l;
+ R0(B,C,D,A,X( 3),19,0); (void)HOST_c2l(data,l); X( 5)=l;
+ R0(A,B,C,D,X( 4), 3,0); (void)HOST_c2l(data,l); X( 6)=l;
+ R0(D,A,B,C,X( 5), 7,0); (void)HOST_c2l(data,l); X( 7)=l;
+ R0(C,D,A,B,X( 6),11,0); (void)HOST_c2l(data,l); X( 8)=l;
+ R0(B,C,D,A,X( 7),19,0); (void)HOST_c2l(data,l); X( 9)=l;
+ R0(A,B,C,D,X( 8), 3,0); (void)HOST_c2l(data,l); X(10)=l;
+ R0(D,A,B,C,X( 9), 7,0); (void)HOST_c2l(data,l); X(11)=l;
+ R0(C,D,A,B,X(10),11,0); (void)HOST_c2l(data,l); X(12)=l;
+ R0(B,C,D,A,X(11),19,0); (void)HOST_c2l(data,l); X(13)=l;
+ R0(A,B,C,D,X(12), 3,0); (void)HOST_c2l(data,l); X(14)=l;
+ R0(D,A,B,C,X(13), 7,0); (void)HOST_c2l(data,l); X(15)=l;
R0(C,D,A,B,X(14),11,0);
R0(B,C,D,A,X(15),19,0);
/* Round 1 */
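
The hunk above swaps the open-coded MD4_Init for the fips_md_init() macro, which pairs with the private_MD4_Init prototype added to md4.h. From memory of the 1.0.1 tree (crypto/crypto.h) the macro expands roughly as below; treat this as an approximation and check the header itself:

/* Approximate definition of fips_md_init (quoted from memory). */
#ifdef OPENSSL_FIPS
#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
#define fips_md_init_ctx(alg, cx) \
	int alg##_Init(cx##_CTX *c) \
	{ \
	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
		"Low level API call to digest " #alg " forbidden in FIPS mode!"); \
	return private_##alg##_Init(c); \
	} \
	int private_##alg##_Init(cx##_CTX *c)
#else
#define fips_md_init(alg) int alg##_Init(alg##_CTX *c)
#endif
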
diff --git a/main/openssl/crypto/md4/md4_locl.h b/main/openssl/crypto/md4/md4_locl.h
index c8085b0e..99c3e500 100644
--- a/main/openssl/crypto/md4/md4_locl.h
+++ b/main/openssl/crypto/md4/md4_locl.h
@@ -77,10 +77,10 @@ void md4_block_data_order (MD4_CTX *c, const void *p,size_t num);
#define HASH_FINAL MD4_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER md4_block_data_order
diff --git a/main/openssl/crypto/md5/asm/md5-586.S b/main/openssl/crypto/md5/asm/md5-586.S
new file mode 100644
index 00000000..23e4de7d
--- /dev/null
+++ b/main/openssl/crypto/md5/asm/md5-586.S
@@ -0,0 +1,679 @@
+.file "crypto/md5/asm/md5-586.s"
+.text
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+.align 16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%edi
+ movl 16(%esp),%esi
+ movl 20(%esp),%ecx
+ pushl %ebp
+ shll $6,%ecx
+ pushl %ebx
+ addl %esi,%ecx
+ subl $64,%ecx
+ movl (%edi),%eax
+ pushl %ecx
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+.L000start:
+
+
+ movl %ecx,%edi
+ movl (%esi),%ebp
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 3614090360(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 4(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 3905402710(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 8(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 606105819(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 12(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 3250441966(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 16(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 4118548399(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 20(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 1200080426(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 24(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2821735955(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 28(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 4249261313(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 32(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1770035416(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 36(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 2336552879(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 40(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 4294925233(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 44(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 2304563134(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 48(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1804603682(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 52(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 4254626195(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 56(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2792965006(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 60(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 1236535329(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 4(%esi),%ebp
+ addl %ecx,%ebx
+
+
+
+ leal 4129170786(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 24(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3225465664(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 44(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 643717713(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl (%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3921069994(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 3593408605(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 40(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 38016083(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 60(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 3634488961(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 16(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3889429448(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 36(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 568446438(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 56(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3275163606(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 12(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 4107603335(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 32(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 1163531501(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 52(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 2850285829(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 8(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 4243563512(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 28(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 1735328473(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 48(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 2368359562(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 4294588738(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 32(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 2272392833(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 1839030562(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 56(%esi),%ebp
+ movl %edx,%edi
+
+ leal 4259657740(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 2763975236(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 16(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 1272893353(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 4139469664(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 40(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3200236656(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 681279174(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl (%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3936430074(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 3572445317(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 24(%esi),%ebp
+ movl %edx,%edi
+
+ leal 76029189(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 3654602809(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 48(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3873151461(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 530742520(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 8(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3299628645(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl (%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ orl %ebx,%edi
+ leal 4096336452(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 1126891415(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 56(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2878612391(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 20(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 4237533241(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 48(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1700485571(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 2399980690(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 40(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 4293915773(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 2240044497(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 32(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1873313359(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 4264355552(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 24(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2734768916(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 1309151649(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 16(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 4149444226(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 3174756917(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 8(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 718787259(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 3951481745(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 24(%esp),%ebp
+ addl %edi,%ebx
+ addl $64,%esi
+ roll $21,%ebx
+ movl (%ebp),%edi
+ addl %ecx,%ebx
+ addl %edi,%eax
+ movl 4(%ebp),%edi
+ addl %edi,%ebx
+ movl 8(%ebp),%edi
+ addl %edi,%ecx
+ movl 12(%ebp),%edi
+ addl %edi,%edx
+ movl %eax,(%ebp)
+ movl %ebx,4(%ebp)
+ movl (%esp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ cmpl %esi,%edi
+ jae .L000start
+ popl %eax
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
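Each unrolled step of the first sixteen rounds above follows one fixed pattern: xorl/andl/xorl evaluates the round function F, leal folds in the message word and the additive constant (for example, 3614090360 is 0xd76aa478, MD5's T[1]), roll rotates, and the final addl adds b. A hedged C equivalent of one such step (a sketch of the standard MD5 round structure, not OpenSSL's actual macros):

    #include <stdint.h>

    /* F(x,y,z) = (x & y) | (~x & z), computed as z ^ (x & (y ^ z)) with
     * one AND and two XORs; this is the xorl/andl/xorl trick used above. */
    static uint32_t md5_step_r0(uint32_t a, uint32_t b, uint32_t c,
                                uint32_t d, uint32_t x, uint32_t t, int s)
    {
        a += (d ^ (b & (c ^ d))) + x + t;        /* a += F(b,c,d) + X[k] + T[i] */
        return ((a << s) | (a >> (32 - s))) + b; /* rotate left by s, add b */
    }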
diff --git a/main/openssl/crypto/md5/asm/md5-x86_64.S b/main/openssl/crypto/md5/asm/md5-x86_64.S
new file mode 100644
index 00000000..235d5e4e
--- /dev/null
+++ b/main/openssl/crypto/md5/asm/md5-x86_64.S
@@ -0,0 +1,668 @@
+.text
+.align 16
+
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r14
+ pushq %r15
+.Lprologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je .Lend
+
+
+.Lloop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb .Lloop
+
+
+.Lend:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+ movq 8(%rsp),%r14
+ movq 16(%rsp),%r12
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rbp
+ addq $40,%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
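The later rounds of this x86_64 version show two more register tricks: round 2 builds G with notl/andl/andl/orl across two scratch registers, and round 4 materializes ~z for I by XORing z into a register preloaded with $4294967295 (all ones). A brief C sketch of the two round functions being computed (standard MD5 definitions, shown for orientation):

    #include <stdint.h>

    /* Round 2: G(x,y,z) = (x & z) | (y & ~z); notl gives ~z, then the
     * two andl results are combined with orl. */
    static uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z)
    {
        return (x & z) | (y & ~z);
    }

    /* Round 4: I(x,y,z) = y ^ (x | ~z); ~z comes from XORing z into an
     * all-ones register. */
    static uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z)
    {
        return y ^ (x | ~z);
    }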
diff --git a/main/openssl/crypto/md5/asm/md5-x86_64.pl b/main/openssl/crypto/md5/asm/md5-x86_64.pl
index 86788543..f11224d1 100755
--- a/main/openssl/crypto/md5/asm/md5-x86_64.pl
+++ b/main/openssl/crypto/md5/asm/md5-x86_64.pl
@@ -120,7 +120,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
die "can't locate x86_64-xlate.pl";
no warnings qw(uninitialized);
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code .= <<EOF;
.text
diff --git a/main/openssl/crypto/md5/md5.h b/main/openssl/crypto/md5/md5.h
index 4cbf8438..541cc925 100644
--- a/main/openssl/crypto/md5/md5.h
+++ b/main/openssl/crypto/md5/md5.h
@@ -105,6 +105,9 @@ typedef struct MD5state_st
unsigned int num;
} MD5_CTX;
+#ifdef OPENSSL_FIPS
+int private_MD5_Init(MD5_CTX *c);
+#endif
int MD5_Init(MD5_CTX *c);
int MD5_Update(MD5_CTX *c, const void *data, size_t len);
int MD5_Final(unsigned char *md, MD5_CTX *c);
diff --git a/main/openssl/crypto/md5/md5_dgst.c b/main/openssl/crypto/md5/md5_dgst.c
index beace632..265890de 100644
--- a/main/openssl/crypto/md5/md5_dgst.c
+++ b/main/openssl/crypto/md5/md5_dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include "md5_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
@@ -70,7 +71,7 @@ const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
#define INIT_DATA_C (unsigned long)0x98badcfeL
#define INIT_DATA_D (unsigned long)0x10325476L
-int MD5_Init(MD5_CTX *c)
+fips_md_init(MD5)
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
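The fips_md_init(MD5) line above is a macro from <openssl/crypto.h> (hence the new include) that emits the function header. Under OPENSSL_FIPS it also renames the real body to private_MD5_Init and generates a public wrapper that refuses low-level use in FIPS mode, which is why md5.h gains the private_MD5_Init prototype. A simplified sketch of what the FIPS expansion looks like (paraphrased, not the exact macro text):

    int MD5_Init(MD5_CTX *c)
    {
        if (FIPS_mode())
            OpenSSLDie(__FILE__, __LINE__,
                       "low-level digest call forbidden in FIPS mode");
        return private_MD5_Init(c);
    }
    int private_MD5_Init(MD5_CTX *c)
    /* ...the original body follows here... */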
diff --git a/main/openssl/crypto/md5/md5_locl.h b/main/openssl/crypto/md5/md5_locl.h
index 968d5779..74d63d1f 100644
--- a/main/openssl/crypto/md5/md5_locl.h
+++ b/main/openssl/crypto/md5/md5_locl.h
@@ -86,10 +86,10 @@ void md5_block_data_order (MD5_CTX *c, const void *p,size_t num);
#define HASH_FINAL MD5_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER md5_block_data_order
diff --git a/main/openssl/crypto/mdc2/mdc2.h b/main/openssl/crypto/mdc2/mdc2.h
index 72778a52..f3e8e579 100644
--- a/main/openssl/crypto/mdc2/mdc2.h
+++ b/main/openssl/crypto/mdc2/mdc2.h
@@ -81,6 +81,9 @@ typedef struct mdc2_ctx_st
} MDC2_CTX;
+#ifdef OPENSSL_FIPS
+int private_MDC2_Init(MDC2_CTX *c);
+#endif
int MDC2_Init(MDC2_CTX *c);
int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
int MDC2_Final(unsigned char *md, MDC2_CTX *c);
diff --git a/main/openssl/crypto/mdc2/mdc2dgst.c b/main/openssl/crypto/mdc2/mdc2dgst.c
index 4aa406ed..d66ed6a1 100644
--- a/main/openssl/crypto/mdc2/mdc2dgst.c
+++ b/main/openssl/crypto/mdc2/mdc2dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/des.h>
#include <openssl/mdc2.h>
@@ -75,7 +76,7 @@
*((c)++)=(unsigned char)(((l)>>24L)&0xff))
static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
-int MDC2_Init(MDC2_CTX *c)
+fips_md_init(MDC2)
{
c->num=0;
c->pad_type=1;
diff --git a/main/openssl/crypto/mem.c b/main/openssl/crypto/mem.c
index 6f80dd33..1cc62eaf 100644
--- a/main/openssl/crypto/mem.c
+++ b/main/openssl/crypto/mem.c
@@ -121,10 +121,11 @@ static void (*set_debug_options_func)(long) = NULL;
static long (*get_debug_options_func)(void) = NULL;
#endif
-
int CRYPTO_set_mem_functions(void *(*m)(size_t), void *(*r)(void *, size_t),
void (*f)(void *))
{
+ /* Dummy call just to ensure OPENSSL_init() gets linked in */
+ OPENSSL_init();
if (!allow_customize)
return 0;
if ((m == 0) || (r == 0) || (f == 0))
@@ -186,6 +187,7 @@ int CRYPTO_set_mem_debug_functions(void (*m)(void *,int,const char *,int,int),
{
if (!allow_customize_debug)
return 0;
+ OPENSSL_init();
malloc_debug_func=m;
realloc_debug_func=r;
free_debug_func=f;
@@ -361,6 +363,10 @@ void *CRYPTO_realloc_clean(void *str, int old_len, int num, const char *file,
if (num <= 0) return NULL;
+ /* We don't support shrinking the buffer. Note the memcpy that copies
+ * |old_len| bytes to the new buffer, below. */
+ if (num < old_len) return NULL;
+
if (realloc_debug_func != NULL)
realloc_debug_func(str, NULL, num, file, line, 0);
ret=malloc_ex_func(num,file,line);
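The new num < old_len check above is load-bearing: CRYPTO_realloc_clean always copies |old_len| bytes into the fresh allocation before scrubbing and freeing the old one, so a shrinking request would make that memcpy overflow the new buffer. A minimal sketch of the shape of the function and the bug the guard prevents (simplified; plain malloc/free stand in for OpenSSL's malloc_ex_func and free_func):

    #include <stdlib.h>
    #include <string.h>

    void *realloc_clean_sketch(void *str, int old_len, int num)
    {
        void *ret;
        if (num <= 0)
            return NULL;
        if (num < old_len)             /* shrinking unsupported: the      */
            return NULL;               /* memcpy below copies old_len bytes */
        ret = malloc((size_t)num);
        if (ret != NULL) {
            memcpy(ret, str, (size_t)old_len);  /* safe: old_len <= num */
            memset(str, 0, (size_t)old_len);    /* scrub old contents */
            free(str);
        }
        return ret;
    }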
diff --git a/main/openssl/crypto/modes/asm/ghash-alpha.pl b/main/openssl/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 00000000..aa360293
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,460 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% better than code generated
+# by the vendor compiler.
+
+$cnt="v0"; # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3"; # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7"; # $8
+#################
+$Xi="a0"; # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4"; # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10"; # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT"; # $28
+
+{ my $N;
+ sub loop() {
+
+ $N++;
+$code.=<<___;
+.align 4
+ extbl $Xlo,7,$nlo
+ and $nlo,0xf0,$nhi
+ sll $nlo,4,$nlo
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Zlo,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Zhi,0($nlo)
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,6(zero)
+ extbl $Xlo,6,$nlo
+
+ ldq $Tlo1,8($nhi)
+ s8addq $remp,$rem_4bit,$remp
+ ldq $Thi1,0($nhi)
+ srl $Zlo,4,$Zlo
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ and $nlo,0xf0,$nhi
+
+ xor $Tlo1,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ xor $Thi1,$Zhi,$Zhi
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Tlo0,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Thi0,0($nlo)
+
+.Looplo$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xlo,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Looplo$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,7(zero)
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ unop
+
+
+.Loophi$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Loophi$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo0,$Zlo,$Zlo
+ xor $Thi0,$Zhi,$Zhi
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ xor $t0,$Zlo,$Zlo
+ xor $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent gcm_gmult_4bit
+gcm_gmult_4bit:
+ .frame sp,0,ra
+ .prologue 0
+
+ ldq $Xlo,8($Xi)
+ ldq $Xhi,0($Xi)
+
+ bsr $t0,picmeup
+ nop
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ ret (ra)
+.end gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent gcm_ghash_4bit
+gcm_ghash_4bit:
+ lda sp,-32(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ .mask 0x04000600,-32
+ .frame sp,32,ra
+ .prologue 0
+
+ ldq_u $inhi,0($inp)
+ ldq_u $Thi0,7($inp)
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+ ldq $Xhi,0($Xi)
+ ldq $Xlo,8($Xi)
+
+ bsr $t0,picmeup
+ nop
+
+.Louter:
+ extql $inhi,$inp,$inhi
+ extqh $Thi0,$inp,$Thi0
+ or $inhi,$Thi0,$inhi
+ lda $inp,16($inp)
+
+ extql $inlo,$inp,$inlo
+ extqh $Tlo0,$inp,$Tlo0
+ or $inlo,$Tlo0,$inlo
+ subq $len,16,$len
+
+ xor $Xlo,$inlo,$Xlo
+ xor $Xhi,$inhi,$Xhi
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+ beq $len,.Ldone
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+ ldq_u $inhi,0($inp)
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+ ldq_u $Thi0,7($inp)
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ br zero,.Louter
+
+.Ldone:
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ .set noreorder
+ /*ldq ra,0(sp)*/
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ lda sp,32(sp)
+ ret (ra)
+.end gcm_ghash_4bit
+
+.align 4
+.ent picmeup
+picmeup:
+ .frame sp,0,$t0
+ .prologue 0
+ br $rem_4bit,.Lpic
+.Lpic: lda $rem_4bit,12($rem_4bit)
+ ret ($t0)
+.end picmeup
+ nop
+rem_4bit:
+ .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
+ .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
+ .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
+ .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
+.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
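The Alpha module above, like the ARM versions that follow, implements the same "4-bit" GHASH scheme: Xi is consumed one nibble at a time against a 16-entry per-key table of multiples of H, and the four bits shifted out of Z at each step are reduced back in through the shared rem_4bit table. A compact C sketch of the gmult core under those assumptions (u128 is a hypothetical two-dword struct; this mirrors, but does not copy, OpenSSL's gcm128.c):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    static const uint16_t rem_4bit[16] = {
        0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
        0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0 };

    /* Multiply Xi by H in GF(2^128), four bits at a time. Htable[n]
     * holds n*H; bytes of Xi are consumed from index 15 down to 0,
     * low nibble first, matching the assembly loops. */
    static void gcm_gmult_4bit_sketch(const uint8_t Xi[16],
                                      const u128 Htable[16], u128 *Z)
    {
        int first = 1;
        Z->hi = Z->lo = 0;
        for (int cnt = 15; cnt >= 0; cnt--) {
            for (int shift = 0; shift <= 4; shift += 4) {
                int nibble = (Xi[cnt] >> shift) & 0xf;
                if (!first) {
                    uint8_t rem = (uint8_t)(Z->lo & 0xf); /* bits shifted out */
                    Z->lo = (Z->hi << 60) | (Z->lo >> 4); /* Z >>= 4 */
                    Z->hi = (Z->hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
                }
                Z->hi ^= Htable[nibble].hi;               /* Z ^= nibble * H */
                Z->lo ^= Htable[nibble].lo;
                first = 0;
            }
        }
        /* The real code then byte-swaps Z back into Xi's layout. */
    }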
diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.S b/main/openssl/crypto/modes/asm/ghash-armv4.S
new file mode 100644
index 00000000..d66c4cbf
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-armv4.S
@@ -0,0 +1,408 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+ sub r2,pc,#8
+ sub r2,r2,#32 @ &rem_4bit
+ b .Lrem_4bit_got
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+ sub r12,pc,#8
+ add r3,r2,r3 @ r3 to point at the end
+ stmdb sp!,{r3-r11,lr} @ save r3/end too
+ sub r12,r12,#48 @ &rem_4bit
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb r12,[r2,#15]
+ ldrb r14,[r0,#15]
+.Louter:
+ eor r12,r12,r14
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ mov r3,#14
+
+ add r7,r1,r12,lsl#4
+ ldmia r7,{r4-r7} @ load Htbl[nlo]
+ add r11,r1,r14
+ ldrb r12,[r2,#14]
+
+ and r14,r4,#0xf @ rem
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ add r14,r14,r14
+ eor r4,r8,r4,lsr#4
+ ldrh r8,[sp,r14] @ rem_4bit[rem]
+ eor r4,r4,r5,lsl#28
+ ldrb r14,[r0,#14]
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ eor r12,r12,r14
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ eor r7,r7,r8,lsl#16
+
+.Linner:
+ add r11,r1,r12,lsl#4
+ and r12,r4,#0xf @ rem
+ subs r3,r3,#1
+ add r12,r12,r12
+ ldmia r11,{r8-r11} @ load Htbl[nlo]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ ldrh r8,[sp,r12] @ rem_4bit[rem]
+ eor r6,r10,r6,lsr#4
+ ldrplb r12,[r2,r3]
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ add r14,r14,r14
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ eor r4,r8,r4,lsr#4
+ ldrplb r8,[r0,r3]
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ ldrh r9,[sp,r14]
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eorpl r12,r12,r8
+ eor r7,r11,r7,lsr#4
+ andpl r14,r12,#0xf0
+ andpl r12,r12,#0x0f
+ eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr r3,[sp,#32] @ re-load r3/end
+ add r2,r2,#16
+ mov r14,r4
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r4,r4
+ str r4,[r0,#12]
+#elif defined(__ARMEB__)
+ str r4,[r0,#12]
+#else
+ mov r9,r4,lsr#8
+ strb r4,[r0,#12+3]
+ mov r10,r4,lsr#16
+ strb r9,[r0,#12+2]
+ mov r11,r4,lsr#24
+ strb r10,[r0,#12+1]
+ strb r11,[r0,#12]
+#endif
+ cmp r2,r3
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r5,r5
+ str r5,[r0,#8]
+#elif defined(__ARMEB__)
+ str r5,[r0,#8]
+#else
+ mov r9,r5,lsr#8
+ strb r5,[r0,#8+3]
+ mov r10,r5,lsr#16
+ strb r9,[r0,#8+2]
+ mov r11,r5,lsr#24
+ strb r10,[r0,#8+1]
+ strb r11,[r0,#8]
+#endif
+ ldrneb r12,[r2,#15]
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r6,r6
+ str r6,[r0,#4]
+#elif defined(__ARMEB__)
+ str r6,[r0,#4]
+#else
+ mov r9,r6,lsr#8
+ strb r6,[r0,#4+3]
+ mov r10,r6,lsr#16
+ strb r9,[r0,#4+2]
+ mov r11,r6,lsr#24
+ strb r10,[r0,#4+1]
+ strb r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r7,r7
+ str r7,[r0,#0]
+#elif defined(__ARMEB__)
+ str r7,[r0,#0]
+#else
+ mov r9,r7,lsr#8
+ strb r7,[r0,#0+3]
+ mov r10,r7,lsr#16
+ strb r9,[r0,#0+2]
+ mov r11,r7,lsr#24
+ strb r10,[r0,#0+1]
+ strb r11,[r0,#0]
+#endif
+
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb r12,[r0,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ mov r3,#14
+
+ add r7,r1,r12,lsl#4
+ ldmia r7,{r4-r7} @ load Htbl[nlo]
+ ldrb r12,[r0,#14]
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ add r14,r14,r14
+ eor r4,r8,r4,lsr#4
+ ldrh r8,[r2,r14] @ rem_4bit[rem]
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ and r14,r12,#0xf0
+ eor r7,r7,r8,lsl#16
+ and r12,r12,#0x0f
+
+.Loop:
+ add r11,r1,r12,lsl#4
+ and r12,r4,#0xf @ rem
+ subs r3,r3,#1
+ add r12,r12,r12
+ ldmia r11,{r8-r11} @ load Htbl[nlo]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ ldrh r8,[r2,r12] @ rem_4bit[rem]
+ eor r6,r10,r6,lsr#4
+ ldrplb r12,[r0,r3]
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ add r14,r14,r14
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ ldrh r8,[r2,r14] @ rem_4bit[rem]
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ andpl r14,r12,#0xf0
+ andpl r12,r12,#0x0f
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r4,r4
+ str r4,[r0,#12]
+#elif defined(__ARMEB__)
+ str r4,[r0,#12]
+#else
+ mov r9,r4,lsr#8
+ strb r4,[r0,#12+3]
+ mov r10,r4,lsr#16
+ strb r9,[r0,#12+2]
+ mov r11,r4,lsr#24
+ strb r10,[r0,#12+1]
+ strb r11,[r0,#12]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r5,r5
+ str r5,[r0,#8]
+#elif defined(__ARMEB__)
+ str r5,[r0,#8]
+#else
+ mov r9,r5,lsr#8
+ strb r5,[r0,#8+3]
+ mov r10,r5,lsr#16
+ strb r9,[r0,#8+2]
+ mov r11,r5,lsr#24
+ strb r10,[r0,#8+1]
+ strb r11,[r0,#8]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r6,r6
+ str r6,[r0,#4]
+#elif defined(__ARMEB__)
+ str r6,[r0,#4]
+#else
+ mov r9,r6,lsr#8
+ strb r6,[r0,#4+3]
+ mov r10,r6,lsr#16
+ strb r9,[r0,#4+2]
+ mov r11,r6,lsr#24
+ strb r10,[r0,#4+1]
+ strb r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r7,r7
+ str r7,[r0,#0]
+#elif defined(__ARMEB__)
+ str r7,[r0,#0]
+#else
+ mov r9,r7,lsr#8
+ strb r7,[r0,#0+3]
+ mov r10,r7,lsr#16
+ strb r9,[r0,#0+2]
+ mov r11,r7,lsr#24
+ strb r10,[r0,#0+1]
+ strb r11,[r0,#0]
+#endif
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ sub r1,#16 @ point at H in GCM128_CTX
+ vld1.64 d29,[r0,:64]!@ load Xi
+ vmov.i32 d5,#0xe1 @ our irreducible polynomial
+ vld1.64 d28,[r0,:64]!
+ vshr.u64 d5,#32
+ vldmia r1,{d0-d1} @ load H
+ veor q12,q12
+#ifdef __ARMEL__
+ vrev64.8 q14,q14
+#endif
+ veor q13,q13
+ veor q11,q11
+ mov r1,#16
+ veor q10,q10
+ mov r3,#16
+ veor d2,d2
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+ b .Linner_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 d21,[r0,:64]! @ load Xi
+ vmov.i32 d5,#0xe1 @ our irreducible polynomial
+ vld1.64 d20,[r0,:64]!
+ vshr.u64 d5,#32
+ vldmia r0,{d0-d1} @ load H
+ veor q12,q12
+ nop
+#ifdef __ARMEL__
+ vrev64.8 q10,q10
+#endif
+.Louter_neon:
+ vld1.64 d29,[r2]! @ load inp
+ veor q13,q13
+ vld1.64 d28,[r2]!
+ veor q11,q11
+ mov r1,#16
+#ifdef __ARMEL__
+ vrev64.8 q14,q14
+#endif
+ veor d2,d2
+ veor q14,q10 @ inp^=Xi
+ veor q10,q10
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+.Linner_neon:
+ subs r1,r1,#1
+ vmull.p8 q9,d1,d4 @ H.lo·Xi[i]
+ vmull.p8 q8,d0,d4 @ H.hi·Xi[i]
+ vext.8 q14,q12,#1 @ IN>>=8
+
+ veor q10,q13 @ modulo-scheduled part
+ vshl.i64 d22,#48
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+ veor d3,d18,d20
+
+ veor d21,d22
+ vuzp.8 q9,q8
+ vsli.8 d2,d3,#1 @ compose the "carry" byte
+ vext.8 q10,q12,#1 @ Z>>=8
+
+ vmull.p8 q11,d2,d5 @ "carry"·0xe1
+ vshr.u8 d2,d3,#7 @ save Z's bottom bit
+ vext.8 q13,q9,q12,#1 @ Qlo>>=8
+ veor q10,q8
+ bne .Linner_neon
+
+ veor q10,q13 @ modulo-scheduled artefact
+ vshl.i64 d22,#48
+ veor d21,d22
+
+ @ finalization, normalize Z:Zo
+ vand d2,d5 @ suffices to mask the bit
+ vshr.u64 d3,d20,#63
+ vshl.i64 q10,#1
+ subs r3,#16
+ vorr q10,q1 @ Z=Z:Zo<<1
+ bne .Louter_neon
+
+#ifdef __ARMEL__
+ vrev64.8 q10,q10
+#endif
+ sub r0,#16
+ vst1.64 d21,[r0,:64]! @ write out Xi
+ vst1.64 d20,[r0,:64]
+
+ .word 0xe12fff1e
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.pl b/main/openssl/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 00000000..e46f8e34
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about the "528B" variant. In the ARM case it makes less sense
+# to implement it, for the following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+# "538B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords is, in turn, whatever the
+# *native* byte order of the current platform is. See gcm128.c for a
+# working example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0"; # argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4"; # variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp; # used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+ my $i=12;
+ my @args=@_;
+ for ($Zll,$Zlh,$Zhl,$Zhh) {
+ $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $_,$_
+ str $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+ str $_,[$Xi,#$i]
+#else
+ mov $Tlh,$_,lsr#8
+ strb $_,[$Xi,#$i+3]
+ mov $Thl,$_,lsr#16
+ strb $Tlh,[$Xi,#$i+2]
+ mov $Thh,$_,lsr#24
+ strb $Thl,[$Xi,#$i+1]
+ strb $Thh,[$Xi,#$i]
+#endif
+___
+ $code.="\t".shift(@args)."\n";
+ $i-=4;
+ }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+ sub $rem_4bit,pc,#8
+ sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+ b .Lrem_4bit_got
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+ sub r12,pc,#8
+ add $len,$inp,$len @ $len to point at the end
+ stmdb sp!,{r3-r11,lr} @ save $len/end too
+ sub r12,r12,#48 @ &rem_4bit
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb $nlo,[$inp,#15]
+ ldrb $nhi,[$Xi,#15]
+.Louter:
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
+ ldrb $nlo,[$inp,#14]
+
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ ldrb $nhi,[$Xi,#14]
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$inp,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrplb $Tll,[$Xi,$cnt]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tlh,[sp,$nhi]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eorpl $nlo,$nlo,$Tll
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr $len,[sp,#32] @ re-load $len/end
+ add $inp,$inp,#16
+ mov $nhi,$Zll
+___
+ &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb $nlo,[$Xi,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ ldrb $nlo,[$Xi,#14]
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ and $nhi,$nlo,#0xf0
+ eor $Zhh,$Zhh,$Tll,lsl#16
+ and $nlo,$nlo,#0x0f
+
+.Loop:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$Xi,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+___
+ &Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl; # $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bit H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ sub $Htbl,#16 @ point at H in GCM128_CTX
+ vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Htbl,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Qpost,$Qpost
+ veor $R,$R
+ mov $cnt,#16
+ veor $Z,$Z
+ mov $len,#16
+ veor $Zo,$Zo
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ b .Linner_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Xi,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+ nop
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+.Louter_neon:
+ vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
+ veor $Qpost,$Qpost
+ vld1.64 `&Dlo($IN)`,[$inp]!
+ veor $R,$R
+ mov $cnt,#16
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Zo,$Zo
+ veor $IN,$Z @ inp^=Xi
+ veor $Z,$Z
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+.Linner_neon:
+ subs $cnt,$cnt,#1
+ vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
+ vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
+ vext.8 $IN,$zero,#1 @ IN>>=8
+
+ veor $Z,$Qpost @ modulo-scheduled part
+ vshl.i64 `&Dlo("$R")`,#48
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+ vuzp.8 $Qlo,$Qhi
+ vsli.8 $Zo,$T,#1 @ compose the "carry" byte
+ vext.8 $Z,$zero,#1 @ Z>>=8
+
+ vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
+ vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
+ vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
+ veor $Z,$Qhi
+ bne .Linner_neon
+
+ veor $Z,$Qpost @ modulo-scheduled artefact
+ vshl.i64 `&Dlo("$R")`,#48
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+
+ @ finalization, normalize Z:Zo
+ vand $Zo,$mod @ suffices to mask the bit
+ vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+ vshl.i64 $Z,#1
+ subs $len,#16
+ vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
+ bne .Louter_neon
+
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+ sub $Xi,#16
+ vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
+ vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+
+ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/main/openssl/crypto/modes/asm/ghash-ia64.pl b/main/openssl/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 00000000..0354c954
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Streamed GHASH performance was measured to be 6.67 cycles per
+# processed byte on Itanium 2, which is >90% better than Microsoft
+# compiler-generated code. To anchor to something else, the
+# sha1-ia64.pl module processes one byte in 5.7 cycles. On Itanium
+# GHASH should run at ~8.5 cycles per byte.
+
+# September 2010
+#
+# It was originally thought that it made less sense to implement the
+# "528B" variant on Itanium 2, for the following reason. Because the
+# number of functional units is naturally limited, it appeared
+# impossible to schedule the "528B" loop in 4 cycles, only in 5. This
+# would mean that the theoretical performance improvement couldn't be
+# more than 20%. But occasionally you prove yourself wrong:-) I figured
+# out a way to fold a couple of instructions and, having freed yet
+# another instruction slot by unrolling the loop, ended up with 4.45
+# cycles per processed byte, 50% better than the "256B" version. On the
+# original Itanium performance should remain the same as for the "256B"
+# version, i.e. ~8.5 cycles.
+
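+# For reference, below is a minimal pure-Perl model of the "256B"
+# algorithm that this and the sibling modules implement. It is a
+# non-authoritative sketch: it is never called, it assumes a perl with
+# 64-bit integers, and it takes Htable as the 16 precomputed [hi,lo]
+# nibble multiples of H (in GHASH's reversed-bit convention, as built
+# by gcm_init_4bit in gcm128.c).
+sub gf128_gmult_4bit_ref {
+my ($xi,$Htable)=@_;			# $xi: ref to the 16 bytes of Xi
+my @rem_4bit=map { $_<<48 }
+	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
+	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
+my @nibs=map { ($_>>4,$_&0xf) } @$xi;	# nhi,nlo pairs for Xi[0]..Xi[15]
+my ($Zhi,$Zlo)=@{$Htable->[pop @nibs]};	# Z = Htable[nlo of Xi[15]]
+while (@nibs) {			# remaining 31 nibbles, Xi[15] down to Xi[0]
+	my $rem=$Zlo&0xf;			# bits about to fall off the bottom
+	$Zlo=(($Zhi&0xf)<<60)|($Zlo>>4);	# Z >>= 4 as a 128-bit quantity
+	$Zhi=($Zhi>>4)^$rem_4bit[$rem];		# fold them back in, pre-reduced
+	my $n=pop @nibs;
+	$Zhi^=$Htable->[$n][0];
+	$Zlo^=$Htable->[$n][1];
+}
+return ($Zhi,$Zlo);			# new Xi
+}
+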
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
+ $big_endian=0 if (/\-DL_ENDIAN/); }
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in a scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about the 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a 128-byte
+# boundary and the lower 7 bits of its address are guaranteed to be
+# zero.
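+# ('dep rem=Zlo,rem_4bitp,3,4' deposits Zlo's 4 low bits at bit
+# position 3 of rem_4bitp, i.e. rem = rem_4bitp | ((Zlo&0xf)<<3), which
+# is &rem_4bit[Zlo&0xf] for the 8-byte entries below.)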
+$code.=<<___;
+$label:
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p19) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p19) xor Zhi=Zhi,Hhi
+ ($p17) xor xi[1]=xi[1],in[1] };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p19) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p19) ld8 rem=[rem]
+ (p18) and Hi[1]=mask0xf0,xi[2] };;
+{ .mmi; ($p16) ld1 in[0]=[inp],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p19) shr.u Zhi=Zhi,4 }
+{ .mib; (p19) xor Hhi=Hhi,rem
+ (p18) add Hi[1]=Htbl,Hi[1] };;
+
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p18) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
+ (p18) xor Zhi=Zhi,Hhi };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p18) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p18) ld8 rem=[rem]
+ (p17) and Hi[0]=mask0xf0,Hi[0] };;
+{ .mmi; (p16) ld1 xi[0]=[Xi],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p18) shr.u Zhi=Zhi,4 }
+{ .mib; (p18) xor Hhi=Hhi,rem
+ (p17) add Hi[0]=Htbl,Hi[0]
+ br.ctop.sptk $label };;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2; prevlc=r3; prevpr=r8;
+mask0xf0=r21;
+rem=r22; rem_4bitp=r23;
+Xi=r24; Htbl=r25;
+inp=r26; end=r27;
+Hhi=r28; Hlo=r29;
+Zhi=r30; Zlo=r31;
+
+.align 128
+.skip 16 // aligns loop body
+.global gcm_gmult_4bit#
+.proc gcm_gmult_4bit#
+gcm_gmult_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,2,6,0,8
+ $ADDP Xi=15,in0 // &Xi[15]
+ mov rem_4bitp=ip }
+{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotr in[3],xi[3],Hi[2]
+
+{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
+ mov mask0xf0=0xf0
+ brp.loop.imp .Loop1,.Lend1-16};;
+{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
+ };;
+{ .mii; shladd Hi[1]=xi[2],4,r0
+ mov pr.rot=0x7<<16
+ mov ar.lc=13 };;
+{ .mii; and Hi[1]=mask0xf0,Hi[1]
+ mov ar.ec=3
+ xor Zlo=Zlo,Zlo };;
+{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
+ add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+ xor Zhi=Zhi,Zhi };;
+___
+ &loop (".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
+{ .mib; mux1 Zlo=Zlo,\@rev };;
+{ .mib; mux1 Zhi=Zhi,\@rev };;
+{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
+ add Hhi=1,Xi };; // pipeline flush on Itanium
+{ .mib; st8 [Hlo]=Zlo
+ mov pr=prevpr,0x1ffff };;
+{ .mib; st8 [Hhi]=Zhi
+ mov ar.lc=prevlc
+ br.ret.sptk.many b0 };;
+.endp gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actually) streamed GHASH
+#
+$Xip="in0";
+$Htbl="in1";
+$inp="in2";
+$len="in3";
+$rem_8bit="loc0";
+$mask0xff="loc1";
+($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
+
+sub load_htable() {
+ for (my $i=0;$i<8;$i++) {
+ $code.=<<___;
+{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
+ ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
+{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
+ ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
+___
+ $code.=shift if (($i+$#_)==7);
+ $code.="\t};;\n"
+ }
+}
+
+$code.=<<___;
+prevsp=r3;
+
+.align 32
+.skip 16 // aligns loop body
+.global gcm_ghash_4bit#
+.proc gcm_ghash_4bit#
+gcm_ghash_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,4,2,0,0
+ .vframe prevsp
+ mov prevsp=sp
+ mov $rem_8bit=ip };;
+ .body
+{ .mfi; $ADDP r8=0+0,$Htbl
+ $ADDP r9=0+8,$Htbl }
+{ .mfi; $ADDP r10=128+0,$Htbl
+ $ADDP r11=128+8,$Htbl };;
+___
+ &load_htable(
+ " $ADDP $Xip=15,$Xip", # &Xi[15]
+ " $ADDP $len=$len,$inp", # &inp[len]
+ " $ADDP $inp=15,$inp", # &inp[15]
+ " mov $mask0xff=0xff",
+ " add sp=-512,sp",
+ " andcm sp=sp,$mask0xff", # align stack frame
+ " add r14=0,sp",
+ " add r15=8,sp");
+$code.=<<___;
+{ .mmi; $sum 1<<1 // go big-endian
+ add r8=256+0,sp
+ add r9=256+8,sp }
+{ .mmi; add r10=256+128+0,sp
+ add r11=256+128+8,sp
+ add $len=-17,$len };;
+___
+for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
+ st8 [r9]=$rhi,16 // Htable[$i].hi
+ shrp $rlo=$rhi,$rlo,4 }//;;
+{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
+ stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
+ shr.u $rhi=$rhi,4 };;
+{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
+ st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
+___
+}
+$code.=<<___;
+{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
+ ld8 r17=[r9],16 };; // Htable[8].hi
+{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
+ ld8 r19=[r9],16 } // Htable[9].hi
+{ .mmi; rum 1<<5 // clear um.mfh
+ shrp r16=r17,r16,4 };;
+___
+for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
+ ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
+ shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+___
+}
+$code.=<<___;
+{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+{ .mmi; add $Htbl=256,sp // &Htable[0]
+ add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+ shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
+___
+
+$in="r15";
+@xi=("r16","r17");
+@rem=("r18","r19");
+($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
+($Atbl,$Btbl)=("r26","r27");
+
+$code.=<<___; # (p16)
+{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ cmp.eq p0,p6=r0,r0 };; // clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17)
+{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
+ dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+.align 32
+.LOOP:
+{ .mmi;
+(p6) st8 [$Xip]=$Zhi,13
+ xor $Zlo=$Zlo,$Zlo
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17),(p18)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 } //(p16) *inp--
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+for ($i=1;$i<14;$i++) {
+# Above and below fragments are derived from this one by removing
+# unsuitable (p??) instructions.
+$code.=<<___; # (p16),(p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 //(p16) *inp--
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+}
+
+$code.=<<___; # (p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p18),(p19)
+{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
+{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
+ xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
+{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p19)
+{ .mmi; cmp.ltu p6,p0=$inp,$len
+ add $inp=32,$inp
+ shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ add $Xip=9,$Xip };; // &Xi.lo
+{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
+{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
+(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
+{ .mmi; st8 [$Xip]=$Zlo,-8
+(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
+ shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
+{ .mib;
+(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
+(p6) br.cond.dptk.many .LOOP };;
+
+{ .mib; st8 [$Xip]=$Zhi };;
+{ .mib; $rum 1<<1 // return to little-endian
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp gcm_ghash_4bit#
+___
+$code.=<<___;
+.align 128
+.type rem_4bit#,\@object
+rem_4bit:
+ data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+ data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+ data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+ data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size rem_4bit#,128
+.type rem_8bit#,\@object
+rem_8bit:
+ data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+ data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+ data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+ data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+ data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+ data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+ data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+ data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+ data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+ data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+ data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+ data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+ data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+ data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+ data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+ data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+ data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+ data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+ data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+ data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+ data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+ data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+ data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+ data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+ data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+ data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+ data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+ data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+ data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+ data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+ data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+ data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size rem_8bit#,512
+stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-parisc.pl b/main/openssl/crypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 00000000..d5ad96b4
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,731 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# On PA-7100LC it processes one byte in 19.6 cycles, which is more than
+# twice as fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is
+# scheduled for 8 cycles, but measured performance on a PA-8600 system
+# is ~9 cycles per processed byte. This is ~2.2x faster than 64-bit
+# code generated by the vendor compiler (which used to be very hard to
+# beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $NREGS =6;
+} else {
+ $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $NREGS =11;
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+ # [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26"; # argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";
+$Hhh=$Htbl; # variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";
+$rem_4bit="%r28";
+$rem="%r29";
+$mask0xf0="%r31";
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {
+ $Zhl="%r6";
+ $Zlh="%r7";
+ $Hhl="%r8";
+ $Hlh="%r9";
+ $Thl="%r10";
+ $Tlh="%r11";
+}
+$rem2="%r6"; # used in PA-RISC 2.0 code
+
+$code.=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 64
+gcm_gmult_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
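+	; materialize &rem_4bit PIC-style: blr leaves the link address
+	; (with the privilege level in its two low bits) in the pointer
+	; register, andcm clears those bits, ldo adds the table offset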
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_gmult
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_gmult
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_gmult_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+ ldbx $cnt($Xi),$nlo
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+ ldd $rem($rem_4bit),$rem
+
+ xor $Tll,$Zll,$Zll
+ addib,uv -1,$cnt,L\$oop_gmult_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ std $Zhh,0($Xi)
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_gmult
+ nop
+
+L\$parisc1_gmult
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_gmult_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_gmult_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ stw $Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+gcm_ghash_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_ghash
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_ghash
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+ ldb 15($inp),$nhi
+ xor $nhi,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_ghash_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldbx $cnt($Xi),$nlo
+ ldbx $cnt($inp),$byte
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ xor $byte,$nlo,$nlo
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+
+ ldd $rem($rem_4bit),$rem
+ addib,uv -1,$cnt,L\$oop_ghash_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ ldo 16($inp),$inp
+ std $Zhh,0($Xi)
+ cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+ copy $Zll,$nlo
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_ghash
+ nop
+
+L\$parisc1_ghash
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+ ldb 15($inp),$byte
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_ghash_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldbx $cnt($inp),$byte
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $byte,$nlo,$nlo
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_ghash_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ ldo 16($inp),$inp
+ stw $Zhh,0($Xi)
+ comb,<> $inp,$len,L\$outer_ghash_pa1
+ copy $Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 64
+L\$rem_4bit
+ .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+ .ALIGN 64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this if GNU assembler understood the .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+ { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have ",u" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
+ { sprintf "\t.WORD\t0x%08x\t; %s",
+ (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $depd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "depd$mod\t$args";
+
+	# I only have ",z" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
+ { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+ my $cpos=63-$2;
+ my $len=32-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
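+# assemble() resolves a mnemonic to one of the encoder closures above
+# via string eval; mnemonics without an encoder pass through verbatim.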
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ if ($SIZE_T==4) {
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+ s/cmpb,\*/comb,/;
+ s/,\*/,/;
+ }
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-s390x.pl b/main/openssl/crypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 00000000..6a40d5d8
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Performance was measured to be ~18 cycles per processed byte on z10,
+# which is almost 40% better than gcc-generated code. It should be
+# noted that 18 cycles is a worse result than expected: the loop is
+# scheduled for 12 and the result should be close to 12. In the absence
+# of instruction-level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z990 it was
+# measured to perform 2.8x better than 32-bit code generated by gcc
+# 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce the correct
+# result and is therefore engaged. On z196 it was measured to process
+# an 8KB buffer ~7x faster than the software implementation. It's not
+# as impressive for smaller buffer sizes, and for the smallest 16-byte
+# buffer it's actually almost 2 times slower. That is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2"; # argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6"; # variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_gmult
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb93e0004 # kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_gmult
+ stg %r0,16($sp) # arrange 16 bytes of zero input
+ stg %r0,24($sp)
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ la $inp,16($sp)
+ lghi $len,16
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+ br %r14
+.align 32
+.Lsoft_gmult:
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ lghi $len,1
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ j .Lgmult_shortcut
+.type gcm_gmult_4bit,\@function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_ghash
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb93e0004 # kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_ghash
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
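+	# (branch mask 1 tests CC3, i.e. retry while kimd reports
+	# partial completion on long buffers)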
+ br %r14
+.align 32
+.Lsoft_ghash:
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ srlg $len,$len,4
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ lg $Zhi,0+1($Xi)
+ lghi $tmp,0
+.Louter:
+ xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zlo,8($inp)
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+ lghi $tmp,0xf0
+ sllg $nlo,$Zlo,4
+ srlg $xi,$Zlo,8 # extract second byte
+ ngr $nlo,$tmp
+ lgr $nhi,$Zlo
+ lghi $cnt,14
+ ngr $nhi,$tmp
+
+ lg $Zlo,8($nlo,$Htbl)
+ lg $Zhi,0($nlo,$Htbl)
+
+ sllg $nlo,$xi,4
+ sllg $rem0,$Zlo,3
+ ngr $nlo,$tmp
+ ngr $rem0,$x78
+ ngr $xi,$tmp
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ j .Lghash_inner
+.align 16
+.Lghash_inner:
+ srlg $Zlo,$Zlo,4
+ sllg $tmp,$Zhi,60
+ xg $Zlo,8($nlo,$Htbl)
+ srlg $Zhi,$Zhi,4
+ llgc $xi,0($cnt,$Xi)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $nlo,$xi,4
+ xg $Zhi,0($rem0,$rem_4bit)
+ nill $nlo,0xf0
+ sllg $rem0,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem0,$x78
+ nill $xi,0xf0
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ xg $Zhi,0($rem1,$rem_4bit)
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ brct $cnt,.Lghash_inner
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $xi,$Zlo,3
+ xg $Zhi,0($rem0,$rem_4bit)
+ xgr $Zlo,$tmp
+ ngr $xi,$x78
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ xgr $Zlo,$tmp
+ xg $Zhi,0($rem1,$rem_4bit)
+
+ lg $tmp,0($xi,$rem_4bit)
+ la $inp,16($inp)
+ sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
+ brctg $len,.Louter
+
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+ lm${g} %r6,%r14,6*$SIZE_T($sp)
+ br %r14
+.type gcm_ghash_4bit,\@function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align 64
+rem_4bit:
+ .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+ .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+ .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+ .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
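+# note: entries are the 16-bit reduction constants <<44 (<<12 within
+# the high word), not the generic <<48, because the loop folds each
+# rem_4bit[rem] term in one 4-bit shift later than the generic C code;
+# the trailing 'sllg by 4' above restores the last term.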
+.type rem_4bit,\@object
+.size rem_4bit,(.-rem_4bit)
+.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-sparcv9.pl b/main/openssl/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 00000000..70e7b044
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Performance results are for the streamed GHASH subroutine on an
+# UltraSPARC pre-Tx CPU and are expressed in cycles per processed byte;
+# less is better:
+#
+# gcc 3.3.x cc 5.2 this assembler
+#
+# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
+# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
+#
+# Here is data collected on an UltraSPARC T1 system running Linux:
+#
+# gcc 4.4.1 this assembler
+#
+# 32-bit build 566 50 (+1000%)
+# 64-bit build 56 50 (+12%)
+#
+# I don't quite understand why the difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for the Z vector (see the C code) even in the 32-bit build... Oh
+# well, it only means more impressive improvement coefficients for this
+# assembler module;-) Loops are aggressively modulo-scheduled with
+# respect to references to input data and Z.hi updates to achieve the
+# 12-cycle timing. To anchor to something else, sha1-sparcv9.pl spends
+# 11.6 cycles to process one byte on an UltraSPARC pre-Tx CPU and ~24
+# on T1.
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0"; # 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0"; # small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0"; # input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+.section ".text",#alloc,#execinstr
+
+.align 64
+rem_4bit:
+ .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type rem_4bit,#object
+.size rem_4bit,(.-rem_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+ save %sp,-$frame,%sp
+ ldub [$inp+15],$nlo
+ ldub [$Xi+15],$xi0
+ ldub [$Xi+14],$xi1
+ add $len,$inp,$len
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+ xor $xi0,$nlo,$nlo
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$inp+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ xor $xi1,$nlo,$nlo
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lghash_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lghash_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ ldub [$Xi+$cnt],$xi1
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $xi1,$nlo,$nlo
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lghash_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+
+ add $inp,16,$inp
+ cmp $inp,$len
+ be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+15],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+ srl $Zlo,8,$xi1
+ and $Zlo,0xff,$xi0
+ ba .Louter
+ and $xi1,0xff,$xi1
+.align 32
+.Ldone:
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_ghash_4bit,#function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+ save %sp,-$frame,%sp
+ ldub [$Xi+15],$nlo
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$Xi+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lgmult_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lgmult_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$Xi+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lgmult_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_gmult_4bit,#function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.S b/main/openssl/crypto/modes/asm/ghash-x86.S
new file mode 100644
index 00000000..cb9ae20d
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86.S
@@ -0,0 +1,728 @@
+.file "ghash-x86.s"
+.text
+.globl gcm_gmult_4bit_x86
+.type gcm_gmult_4bit_x86,@function
+.align 16
+gcm_gmult_4bit_x86:
+.L_gcm_gmult_4bit_x86_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $84,%esp
+ movl 104(%esp),%edi
+ movl 108(%esp),%esi
+ movl (%edi),%ebp
+ movl 4(%edi),%edx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%ebx
+ movl $0,16(%esp)
+ movl $471859200,20(%esp)
+ movl $943718400,24(%esp)
+ movl $610271232,28(%esp)
+ movl $1887436800,32(%esp)
+ movl $1822425088,36(%esp)
+ movl $1220542464,40(%esp)
+ movl $1423966208,44(%esp)
+ movl $3774873600,48(%esp)
+ movl $4246732800,52(%esp)
+ movl $3644850176,56(%esp)
+ movl $3311403008,60(%esp)
+ movl $2441084928,64(%esp)
+ movl $2376073216,68(%esp)
+ movl $2847932416,72(%esp)
+ movl $3051356160,76(%esp)
+ movl %ebp,(%esp)
+ movl %edx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ shrl $20,%ebx
+ andl $240,%ebx
+ movl 4(%esi,%ebx,1),%ebp
+ movl (%esi,%ebx,1),%edx
+ movl 12(%esi,%ebx,1),%ecx
+ movl 8(%esi,%ebx,1),%ebx
+ xorl %eax,%eax
+ movl $15,%edi
+ jmp .L000x86_loop
+.align 16
+.L000x86_loop:
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ andb $240,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ decl %edi
+ js .L001x86_break
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ shlb $4,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ jmp .L000x86_loop
+.align 16
+.L001x86_break:
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ bswap %ebp
+ movl 104(%esp),%edi
+ movl %ebx,12(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,4(%edi)
+ movl %ebp,(%edi)
+ addl $84,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin
+.globl gcm_ghash_4bit_x86
+.type gcm_ghash_4bit_x86,@function
+.align 16
+gcm_ghash_4bit_x86:
+.L_gcm_ghash_4bit_x86_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $84,%esp
+ movl 104(%esp),%ebx
+ movl 108(%esp),%esi
+ movl 112(%esp),%edi
+ movl 116(%esp),%ecx
+ addl %edi,%ecx
+ movl %ecx,116(%esp)
+ movl (%ebx),%ebp
+ movl 4(%ebx),%edx
+ movl 8(%ebx),%ecx
+ movl 12(%ebx),%ebx
+ movl $0,16(%esp)
+ movl $471859200,20(%esp)
+ movl $943718400,24(%esp)
+ movl $610271232,28(%esp)
+ movl $1887436800,32(%esp)
+ movl $1822425088,36(%esp)
+ movl $1220542464,40(%esp)
+ movl $1423966208,44(%esp)
+ movl $3774873600,48(%esp)
+ movl $4246732800,52(%esp)
+ movl $3644850176,56(%esp)
+ movl $3311403008,60(%esp)
+ movl $2441084928,64(%esp)
+ movl $2376073216,68(%esp)
+ movl $2847932416,72(%esp)
+ movl $3051356160,76(%esp)
+.align 16
+.L002x86_outer_loop:
+ xorl 12(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 4(%edi),%edx
+ xorl (%edi),%ebp
+ movl %ebx,12(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,(%esp)
+ shrl $20,%ebx
+ andl $240,%ebx
+ movl 4(%esi,%ebx,1),%ebp
+ movl (%esi,%ebx,1),%edx
+ movl 12(%esi,%ebx,1),%ecx
+ movl 8(%esi,%ebx,1),%ebx
+ xorl %eax,%eax
+ movl $15,%edi
+ jmp .L003x86_loop
+.align 16
+.L003x86_loop:
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ andb $240,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ decl %edi
+ js .L004x86_break
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ shlb $4,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ jmp .L003x86_loop
+.align 16
+.L004x86_break:
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ bswap %ebp
+ movl 112(%esp),%edi
+ leal 16(%edi),%edi
+ cmpl 116(%esp),%edi
+ movl %edi,112(%esp)
+ jb .L002x86_outer_loop
+ movl 104(%esp),%edi
+ movl %ebx,12(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,4(%edi)
+ movl %ebp,(%edi)
+ addl $84,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
+.type _mmx_gmult_4bit_inner,@function
+.align 16
+_mmx_gmult_4bit_inner:
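+# fully unrolled rendition of the MMX inner loop: one table-lookup,
+# shift and reduce step per nibble of Xi; as set up by its callers (not
+# shown here), %edi points at Xi, %esi at the Htable, %eax at the
+# 16-entry, 8-byte-per-entry reduction table, and %ebx holds Xi[15]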
+ xorl %ecx,%ecx
+ movl %ebx,%edx
+ movb %dl,%cl
+ shlb $4,%cl
+ andl $240,%edx
+ movq 8(%esi,%ecx,1),%mm0
+ movq (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 14(%edi),%cl
+ psllq $60,%mm2
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 13(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 12(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 11(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 10(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 9(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 8(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 7(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 6(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 5(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 4(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 3(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 2(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 1(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb (%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ movl 4(%eax,%ebp,8),%edi
+ psrlq $32,%mm0
+ movd %mm1,%edx
+ psrlq $32,%mm1
+ movd %mm0,%ecx
+ movd %mm1,%ebp
+ shll $4,%edi
+ bswap %ebx
+ bswap %edx
+ bswap %ecx
+ xorl %edi,%ebp
+ bswap %ebp
+ ret
+.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
+.globl gcm_gmult_4bit_mmx
+.type gcm_gmult_4bit_mmx,@function
+.align 16
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ call .L005pic_point
+.L005pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L005pic_point(%eax),%eax
+ movzbl 15(%edi),%ebx
+ call _mmx_gmult_4bit_inner
+ movl 20(%esp),%edi
+ emms
+ movl %ebx,12(%edi)
+ movl %edx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %ebp,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_4bit_mmx,.-.L_gcm_gmult_4bit_mmx_begin
+.globl gcm_ghash_4bit_mmx
+.type gcm_ghash_4bit_mmx,@function
+.align 16
+gcm_ghash_4bit_mmx:
+.L_gcm_ghash_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ecx
+ call .L006pic_point
+.L006pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L006pic_point(%eax),%eax
+ addl %edi,%ecx
+ movl %ecx,32(%esp)
+ subl $20,%esp
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%edx
+ movl 8(%ebp),%ecx
+ movl (%ebp),%ebp
+ jmp .L007mmx_outer_loop
+.align 16
+.L007mmx_outer_loop:
+ xorl 12(%edi),%ebx
+ xorl 4(%edi),%edx
+ xorl 8(%edi),%ecx
+ xorl (%edi),%ebp
+ movl %edi,48(%esp)
+ movl %ebx,12(%esp)
+ movl %edx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %ebp,(%esp)
+ movl %esp,%edi
+ shrl $24,%ebx
+ call _mmx_gmult_4bit_inner
+ movl 48(%esp),%edi
+ leal 16(%edi),%edi
+ cmpl 52(%esp),%edi
+ jb .L007mmx_outer_loop
+ movl 40(%esp),%edi
+ emms
+ movl %ebx,12(%edi)
+ movl %edx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %ebp,(%edi)
+ addl $20,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.align 64
+.Lrem_4bit:
+.long 0,0,0,29491200,0,58982400,0,38141952
+.long 0,117964800,0,113901568,0,76283904,0,88997888
+.long 0,235929600,0,265420800,0,227803136,0,206962688
+.long 0,152567808,0,148504576,0,177995776,0,190709760
+.align 64
+.L008rem_8bit:
+.value 0,450,900,582,1800,1738,1164,1358
+.value 3600,4050,3476,3158,2328,2266,2716,2910
+.value 7200,7650,8100,7782,6952,6890,6316,6510
+.value 4656,5106,4532,4214,5432,5370,5820,6014
+.value 14400,14722,15300,14854,16200,16010,15564,15630
+.value 13904,14226,13780,13334,12632,12442,13020,13086
+.value 9312,9634,10212,9766,9064,8874,8428,8494
+.value 10864,11186,10740,10294,11640,11450,12028,12094
+.value 28800,28994,29444,29382,30600,30282,29708,30158
+.value 32400,32594,32020,31958,31128,30810,31260,31710
+.value 27808,28002,28452,28390,27560,27242,26668,27118
+.value 25264,25458,24884,24822,26040,25722,26172,26622
+.value 18624,18690,19268,19078,20424,19978,19532,19854
+.value 18128,18194,17748,17558,16856,16410,16988,17310
+.value 21728,21794,22372,22182,21480,21034,20588,20910
+.value 23280,23346,22900,22710,24056,23610,24188,24510
+.value 57600,57538,57988,58182,58888,59338,58764,58446
+.value 61200,61138,60564,60758,59416,59866,60316,59998
+.value 64800,64738,65188,65382,64040,64490,63916,63598
+.value 62256,62194,61620,61814,62520,62970,63420,63102
+.value 55616,55426,56004,56070,56904,57226,56780,56334
+.value 55120,54930,54484,54550,53336,53658,54236,53790
+.value 50528,50338,50916,50982,49768,50090,49644,49198
+.value 52080,51890,51444,51510,52344,52666,53244,52798
+.value 37248,36930,37380,37830,38536,38730,38156,38094
+.value 40848,40530,39956,40406,39064,39258,39708,39646
+.value 36256,35938,36388,36838,35496,35690,35116,35054
+.value 33712,33394,32820,33270,33976,34170,34620,34558
+.value 43456,43010,43588,43910,44744,44810,44364,44174
+.value 42960,42514,42068,42390,41176,41242,41820,41630
+.value 46560,46114,46692,47014,45800,45866,45420,45230
+.value 48112,47666,47220,47542,48376,48442,49020,48830
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte 0
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.pl b/main/openssl/crypto/modes/asm/ghash-x86.pl
new file mode 100644
index 00000000..2426cd0c
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses a 256-byte per-key table [+64/128 bytes of fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. The former is executed on
+# 486 and Pentium, the latter on all others. MMX GHASH features a so-called
+# "528B" variant of the "4-bit" method utilizing an additional 256+16 bytes
+# of per-key storage [+512 bytes of shared table]. Performance results
+# are for the streamed GHASH subroutine and are expressed in cycles per
+# processed byte; less is better:
+#
+# gcc 2.95.3(*) MMX assembler x86 assembler
+#
+# Pentium 105/111(**) - 50
+# PIII 68 /75 12.2 24
+# P4 125/125 17.8 84(***)
+# Opteron 66 /70 10.1 30
+# Core2 54 /67 8.4 18
+#
+# (*)	gcc 3.4.x was observed to generate a few percent slower code,
+#	which is one of the reasons why the 2.95.3 results were chosen;
+#	another reason is the lack of 3.4.x results for older CPUs;
+# comparison with MMX results is not completely fair, because C
+# results are for vanilla "256B" implementation, while
+# assembler results are for "528B";-)
+# (**)	second number is the result for code compiled with the -fPIC flag,
+# which is actually more relevant, because assembler code is
+# position-independent;
+# (***) see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else: SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for the choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close it is to the theoretical limit. The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that a single
+# Karatsuba multiplication would take 28 cycles *plus* a few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while, we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is the time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation, let's have a closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite an optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in the same subroutine, the
+# former's performance is really limited to the above (Tmul + Tmod/Naggr)
+# equation. But if the GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So the
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
+# where Tproc is the time required for Karatsuba pre- and post-processing,
+# is a more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications, the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better because the larger register
+# bank allows reduction and multiplication to be interleaved better.
+#
+# Does it make sense to increase Naggr? To start with, it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise the improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
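+# As an editorial aside (not part of the original module), the figures
+# quoted above all follow from the (28+Tmod/Naggr)/16 estimate; the
+# hypothetical helper ghash_cpb() below merely reproduces them:
+#
+sub ghash_cpb { my ($tmod,$naggr)=@_; return (28+$tmod/$naggr)/16; }
+# ghash_cpb(19,4) = 2.05 - Intel's estimate (Tmod~19, Naggr=4)
+# ghash_cpb(13,2) = 2.16 - this module's estimate (Tmod~13, Naggr=2)
+# ghash_cpb( 5,2) = 1.91 - the "realistic" bound (Tproc~5 in place of Tmod)
+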
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on the same XMM register, the PCLMULQDQ subroutine was measured to
+# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough, an attempt to modify the 64-bit code
+# in a similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where the original 64-bit code processes one byte in 1.95 cycles.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp = "edi";
+$Htbl = "esi";
+
+$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
+		# than unrolled, which has to be weighed against
+ # 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+ my $off = shift;
+ my $rem = "eax";
+
+ &mov ($Zhh,&DWP(4,$Htbl,$Zll));
+ &mov ($Zhl,&DWP(0,$Htbl,$Zll));
+ &mov ($Zlh,&DWP(12,$Htbl,$Zll));
+ &mov ($Zll,&DWP(8,$Htbl,$Zll));
+ &xor ($rem,$rem); # avoid partial register stalls on PIII
+
+	# shrd practically kills P4 (2.5x deterioration), but P4 has an
+	# MMX code-path to execute. shrd runs a tad faster [than twice
+	# the shifts, mov's and or's] on pre-MMX Pentium (as well as
+	# PIII and Core2), *but* minimizes code size, spares a register
+	# and thus allows the loop to be folded...
+ if (!$unroll) {
+ my $cnt = $inp;
+ &mov ($cnt,15);
+ &jmp (&label("x86_loop"));
+ &set_label("x86_loop",16);
+ for($i=1;$i<=2;$i++) {
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ &mov (&LB($rem),&BP($off,"esp",$cnt));
+ if ($i&1) {
+ &and (&LB($rem),0xf0);
+ } else {
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+
+ if ($i&1) {
+ &dec ($cnt);
+ &js (&label("x86_break"));
+ } else {
+ &jmp (&label("x86_loop"));
+ }
+ }
+ &set_label("x86_break",16);
+ } else {
+ for($i=1;$i<32;$i++) {
+ &comment($i);
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ if ($i&1) {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &and (&LB($rem),0xf0);
+ } else {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+ }
+ }
+ &bswap ($Zll);
+ &bswap ($Zlh);
+ &bswap ($Zhl);
+ if (!$x86only) {
+ &bswap ($Zhh);
+ } else {
+ &mov ("eax",$Zhh);
+ &bswap ("eax");
+ &mov ($Zhh,"eax");
+ }
+}
+
+if ($unroll) {
+ &function_begin_B("_x86_gmult_4bit_inner");
+ &x86_loop(4);
+ &ret ();
+ &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+ my $bias = shift;
+
+ &mov (&DWP($bias+0, "esp"),0x0000<<16);
+ &mov (&DWP($bias+4, "esp"),0x1C20<<16);
+ &mov (&DWP($bias+8, "esp"),0x3840<<16);
+ &mov (&DWP($bias+12,"esp"),0x2460<<16);
+ &mov (&DWP($bias+16,"esp"),0x7080<<16);
+ &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
+ &mov (&DWP($bias+24,"esp"),0x48C0<<16);
+ &mov (&DWP($bias+28,"esp"),0x54E0<<16);
+ &mov (&DWP($bias+32,"esp"),0xE100<<16);
+ &mov (&DWP($bias+36,"esp"),0xFD20<<16);
+ &mov (&DWP($bias+40,"esp"),0xD940<<16);
+ &mov (&DWP($bias+44,"esp"),0xC560<<16);
+ &mov (&DWP($bias+48,"esp"),0x9180<<16);
+ &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
+ &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
+ &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
+}
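+
+# Editorial sketch (an inference from the values above, unused by the
+# module): each constant deposited by deposit_rem_4bit is the carry-less
+# GF(2) product of its 4-bit index and 0x1C2 - the reflected reduction
+# polynomial byte 0xE1 shifted left by one - itself shifted left by 4;
+# the <<16 in deposit_rem_4bit then moves each value into the top bits.
+# The hypothetical rem4() reproduces them:
+sub rem4 {
+	my $i = shift;			# dropped 4-bit nibble
+	my $r = 0;
+	for my $bit (0..3) {		# carry-less multiply by 0x1C2
+		$r ^= 0x1C2<<$bit if (($i>>$bit)&1);
+	}
+	return $r<<4;			# rem4(1)==0x1C20, rem4(8)==0xE100
+}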
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for stack alignment
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$inp));
+ &mov ($Zlh,&DWP(8,$inp));
+ &mov ($Zll,&DWP(12,$inp));
+
+ &deposit_rem_4bit(16);
+
+ &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(12,"esp"),$Zll);
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(0));
+ }
+
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for 64-bit alignment
+ &mov ($Zll,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ("ecx",&wparam(3)); # load len
+ &add ("ecx",$inp);
+ &mov (&wparam(3),"ecx");
+
+ &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zll));
+ &mov ($Zlh,&DWP(8,$Zll));
+ &mov ($Zll,&DWP(12,$Zll));
+
+ &deposit_rem_4bit(16);
+
+ &set_label("x86_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp)); # xor with input
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&DWP(12,"esp"),$Zll); # dump it on stack
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(2));
+ }
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &mov (&wparam(2),$inp) if (!$unroll);
+ &jb (&label("x86_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{ # pure-MMX "May" version...
+
+$S=12; # shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words, the effort is considered to be
+# well spent... Since the initial release the loop was unrolled in order
+# to "liberate" the register previously used as the loop counter. Instead
+# it's used to optimize the critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves a move of Z.lo from an MMX to an integer register,
+# effective address calculation and finally a merge of the value into
+# Z.hi. The reference to rem_4bit is scheduled so late that I had to >>4
+# the rem_4bit elements. This resulted in a 20-45% improvement on
+# contemporary µ-archs.
+{
+ my $cnt;
+ my $rem_4bit = "eax";
+ my @rem = ($Zhh,$Zll);
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem[0],$Zlo);
+
+ for ($cnt=28;$cnt>=-2;$cnt--) {
+ my $odd = $cnt&1;
+ my $nix = $odd ? $nlo : $nhi;
+
+ &shl (&LB($nlo),4) if ($odd);
+ &psrlq ($Zlo,4);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nix));
+ &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
+ &psllq ($tmp,60);
+ &and ($nhi,0xf0) if ($odd);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+ &and ($rem[0],0xf);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nix));
+ &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
+ &movd ($rem[1],$Zlo);
+ &pxor ($Zlo,$tmp);
+
+ push (@rem,shift(@rem)); # "rotate" registers
+ }
+
+ &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+ &shl ($inp,4); # compensate for rem_4bit[i] being >>4
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &xor ($Zhh,$inp);
+ &bswap ($Zhh);
+
+ &ret ();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+ &mov ($Zhh,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ($Zlh,&wparam(3)); # load len
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &add ($Zlh,$inp);
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
+ &stack_push(4+1); # +1 for stack alignment
+
+ &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zhh));
+ &mov ($Zlh,&DWP(8,$Zhh));
+ &mov ($Zhh,&DWP(0,$Zhh));
+ &jmp (&label("mmx_outer_loop"));
+
+ &set_label("mmx_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&wparam(2),$inp);
+ &mov (&DWP(12,"esp"),$Zll);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &mov ($inp,"esp");
+ &shr ($Zll,24);
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(2));
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &jb (&label("mmx_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+
+ &stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{ # "June" MMX version...
+ # ... has slower "April" gcm_gmult_4bit_mmx with folded
+ # loop. This is done to conserve code size...
+$S=16; # shift factor for rem_4bit
+
+sub mmx_loop() {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words, the effort is considered to be well
+# spent...
+ my $inp = shift;
+ my $rem_4bit = shift;
+ my $cnt = $Zhh;
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+ my $rem = $Zll;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &mov ($cnt,14);
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem,$Zlo);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_loop",16);
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &mov (&LB($nlo),&BP(0,$inp,$cnt));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &dec ($cnt);
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &mov ($nhi,$nlo);
+ &pxor ($Zlo,$tmp);
+ &js (&label("mmx_break"));
+
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_break",16);
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &bswap ($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &mmx_loop($inp,"eax");
+
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# The subroutine below is the "528B" variant of the "4-bit" GCM GHASH
+# function (see gcm128.c for details). It provides a further 20-40%
+# performance improvement over the above-mentioned "May" version.
+
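+# (Editorial note: in this implementation the 528 bytes live on the stack:
+# 512 bytes of decomposed per-key tables - the low/high halves of Htable[]
+# and of Htable[]>>4 - plus 16 bytes of (u8)(Htable[]<<4); see the frame
+# setup below.)
+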
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+ my $rem_8bit = "esi";
+ my $Htbl = "ebx";
+
+ # parameter block
+ &mov ("eax",&wparam(0)); # Xi
+ &mov ("ebx",&wparam(1)); # Htable
+ &mov ("ecx",&wparam(2)); # inp
+ &mov ("edx",&wparam(3)); # len
+ &mov ("ebp","esp"); # original %esp
+ &call (&label("pic_point"));
+ &set_label ("pic_point");
+ &blindpop ($rem_8bit);
+ &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+ &sub ("esp",512+16+16); # allocate stack frame...
+ &and ("esp",-64); # ...and align it
+ &sub ("esp",16); # place for (u8)(H[]<<4)
+
+ &add ("edx","ecx"); # pointer to the end of input
+ &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
+ &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
+ &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
+
+ { my @lo = ("mm0","mm1","mm2");
+ my @hi = ("mm3","mm4","mm5");
+ my @tmp = ("mm6","mm7");
+ my ($off1,$off2,$i) = (0,0,);
+
+ &add ($Htbl,128); # optimize for size
+ &lea ("edi",&DWP(16+128,"esp"));
+ &lea ("ebp",&DWP(16+256+128,"esp"));
+
+ # decompose Htable (low and high parts are kept separately),
+ # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+ for ($i=0;$i<18;$i++) {
+
+ &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &psllq ($tmp[1],60) if ($i>1);
+ &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
+ &por ($lo[2],$tmp[1]) if ($i>1);
+ &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
+ &psrlq ($lo[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
+ &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
+ &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
+ &psrlq ($hi[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
+ &shl ("edx",4) if ($i<16);
+ &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
+
+ unshift (@lo,pop(@lo)); # "rotate" registers
+ unshift (@hi,pop(@hi));
+ unshift (@tmp,pop(@tmp));
+ $off1 += 8 if ($i>0);
+ $off2 += 8 if ($i>1);
+ }
+ }
+
+ &movq ($Zhi,&QWP(0,"eax"));
+ &mov ("ebx",&DWP(8,"eax"));
+ &mov ("edx",&DWP(12,"eax")); # load Xi
+
+&set_label("outer",16);
+ { my $nlo = "eax";
+ my $dat = "edx";
+ my @nhi = ("edi","ebp");
+ my @rem = ("ebx","ecx");
+ my @red = ("mm0","mm1","mm2");
+ my $tmp = "mm3";
+
+ &xor ($dat,&DWP(12,"ecx")); # merge input data
+ &xor ("ebx",&DWP(8,"ecx"));
+ &pxor ($Zhi,&QWP(0,"ecx"));
+ &lea ("ecx",&DWP(16,"ecx")); # inp+=16
+ #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
+ &mov (&DWP(528+8,"esp"),"ebx");
+ &movq (&QWP(528+0,"esp"),$Zhi);
+ &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($nhi[1],$nlo);
+ &and (&LB($nlo),0x0f);
+ &shr ($nhi[1],4);
+ &pxor ($red[0],$red[0]);
+ &rol ($dat,8); # next byte
+ &pxor ($red[1],$red[1]);
+ &pxor ($red[2],$red[2]);
+
+	# Just like in the "May" version, modulo-schedule for the critical
+	# path in 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. The final
+	# 'pxor' is scheduled so late that rem_8bit[] has to be shifted *right*
+	# by 16, which is why the last argument to pinsrw is 2, which
+ # corresponds to <<32=<<48>>16...
+ for ($j=11,$i=0;$i<15;$i++) {
+
+ if ($i>0) {
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &rol ($dat,8); # next byte
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+ } else {
+ &movq ($Zlo,&QWP(16,"esp",$nlo,8));
+ &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ }
+
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
+
+ &movd ($rem[0],$Zlo);
+ &movz ($rem[1],&LB($rem[1])) if ($i>0);
+ &psrlq ($Zlo,8); # Z>>=8
+
+ &movq ($tmp,$Zhi);
+ &mov ($nhi[0],$nlo);
+ &psrlq ($Zhi,8);
+
+ &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
+ &and (&LB($nlo),0x0f);
+ &psllq ($tmp,56);
+
+ &pxor ($Zhi,$red[1]) if ($i>1);
+ &shr ($nhi[0],4);
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
+
+ unshift (@red,pop(@red)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ unshift (@nhi,pop(@nhi));
+ }
+
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &movz ($rem[1],&LB($rem[1]));
+
+ &pxor ($red[2],$red[2]); # clear 2nd word
+ &psllq ($red[1],4);
+
+ &movd ($rem[0],$Zlo);
+ &psrlq ($Zlo,4); # Z>>=4
+
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &shl ($rem[0],4); # rem<<4
+
+ &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
+ &psllq ($tmp,60);
+ &movz ($rem[0],&LB($rem[0]));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+ &pxor ($Zhi,$red[1]);
+
+ &movd ($dat,$Zlo);
+ &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
+
+ &psllq ($red[0],12); # correct by <<16>>4
+ &pxor ($Zhi,$red[0]);
+ &psrlq ($Zlo,32);
+ &pxor ($Zhi,$red[2]);
+
+ &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
+ &movd ("ebx",$Zlo);
+ &movq ($tmp,$Zhi); # 01234567
+ &psllw ($Zhi,8); # 1.3.5.7.
+ &psrlw ($tmp,8); # .0.2.4.6
+ &por ($Zhi,$tmp); # 10325476
+ &bswap ($dat);
+ &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
+ &bswap ("ebx");
+
+ &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
+ &jne (&label("outer"));
+ }
+
+ &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
+ &mov (&DWP(12,"eax"),"edx");
+ &mov (&DWP(8,"eax"),"ebx");
+ &movq (&QWP(0,"eax"),$Zhi);
+
+ &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
+ &emms ();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 { # minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+ &movdqa ($Xhi,$Xi); #
+ &pshufd ($T1,$Xi,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110);
+ &pxor ($T1,$Xi); #
+ &pxor ($T2,$Hkey);
+
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T2,0x00); #######
+ &xorps ($T1,$Xi); #
+ &xorps ($T1,$Xhi); #
+
+ &movdqa ($T2,$T1); #
+ &psrldq ($T1,8);
+ &pslldq ($T2,8); #
+ &pxor ($Xhi,$T1);
+ &pxor ($Xi,$T2); #
+}
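+
+# Editorial sketch (an illustration, not used by the module): the subroutine
+# above is one-level Karatsuba over GF(2) - three pclmulqdq instead of four,
+# relying on Xh*Hl ^ Xl*Hh == (Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl. The
+# hypothetical kara_gf2() checks the identity on narrow $w-bit halves; the
+# xmm code above does the same with 64-bit halves.
+sub kara_gf2 {
+	my ($xh,$xl,$hh,$hl,$w) = @_;
+	my $clmul = sub {	my ($a,$b)=@_; my $r=0;
+				$r ^= $b<<$_ for grep { ($a>>$_)&1 } 0..$w-1;
+				$r };
+	my $lo  = $clmul->($xl,$hl);
+	my $hi  = $clmul->($xh,$hh);
+	my $mid = $clmul->($xh^$xl,$hh^$hl)^$hi^$lo;	# == Xh*Hl ^ Xl*Hh
+	return ($hi<<(2*$w))^($mid<<$w)^$lo;		# full 2w-bit product
+}
+# e.g. kara_gf2(0xA3,0x5C,0x7E,0xD1,8) equals the schoolbook 16x16 clmul.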
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+ &movdqa ($T1,$Xi); #
+ &movdqa ($Xhi,$Xi);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pshufd ($T2,$T1,0b01001110); #
+ &pshufd ($T3,$Hkey,0b01001110);
+ &pxor ($T2,$T1); #
+ &pxor ($T3,$Hkey);
+ &pclmulqdq ($T2,$T3,0x00); #######
+ &pxor ($T2,$Xi); #
+ &pxor ($T2,$Xhi); #
+
+ &movdqa ($T3,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T3,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+}
+
+if (1) { # Algorithm 9 with <<1 twist.
+ # Reduction is shorter and uses only two
+			# temporary registers, which makes it a better
+ # candidate for interleaving with 64x64
+ # multiplication. Pre-modulo-scheduled loop
+ # was found to be ~20% faster than Algorithm 5
+ # below. Algorithm 9 was therefore chosen for
+ # further optimization...
+
+sub reduction_alg9 { # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+ # 1st phase
+ &movdqa ($T1,$Xi); #
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,5); #
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,57); #
+ &movdqa ($T2,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T2,8); #
+ &pxor ($Xi,$T1);
+ &pxor ($Xhi,$T2); #
+
+ # 2nd phase
+ &movdqa ($T2,$Xi);
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+ &pxor ($T2,$Xhi);
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+}
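+
+# Editorial note (inferred by tracing the shifts above, per 64-bit lane):
+# the 1st phase computes T<<63 ^ T<<62 ^ T<<57 via the ((T<<1^T)<<5^T)<<57
+# chain, and the 2nd folds X ^ X>>1 ^ X>>2 ^ X>>7 into Xhi - together the
+# bit-reflected reduction modulo x^128+x^7+x^2+x+1.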
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # <<1 twist
+ &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
+ &movdqa ($T1,$Hkey);
+ &psllq ($Hkey,1);
+ &pxor ($T3,$T3); #
+ &psrlq ($T1,63);
+ &pcmpgtd ($T3,$T2); # broadcast carry bit
+ &pslldq ($T1,8);
+ &por ($Hkey,$T1); # H<<=1
+
+ # magic reduction
+ &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
+ &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movups ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
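+	# (Editorial note: with Naggr==2 every other reduction is skipped -
+	# H*Ii+1 is computed up front and folded in only after the
+	# H^2*(Ii+Xi) product.)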
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+ &sub ($len,0x20);
+ &jbe (&label("even_tail"));
+
+&set_label("mod_loop");
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+
+ &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+
+ &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,5); #
+ &pxor ($Xi,$T1); #
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &psllq ($Xi,57); #
+ &movdqa ($T2,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T2,8); #
+ &pxor ($Xi,$T1);
+ &pshufd ($T1,$T3,0b01001110);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T1,$T3);
+ &pshufd ($T3,$Hkey,0b01001110);
+ &pxor ($T3,$Hkey); #
+
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+ &pxor ($T2,$Xhi);
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+
+ &pclmulqdq ($T1,$T3,0x00); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &xorps ($T1,$Xn); #
+ &xorps ($T1,$Xhn); #
+
+ &movdqa ($T3,$T1); #
+ &psrldq ($T1,8);
+ &pslldq ($T3,8); #
+ &pxor ($Xhn,$T1);
+ &pxor ($Xn,$T3); #
+ &movdqa ($T3,&QWP(0,$const));
+
+ &lea ($inp,&DWP(32,$inp));
+ &sub ($len,0x20);
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {		# Algorithm 5. Kept for reference purposes.
+
+sub reduction_alg5 { # 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+ # <<1
+ &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xhi);
+ &pslld ($Xi,1);
+ &pslld ($Xhi,1); #
+ &psrld ($T1,31);
+ &psrld ($T2,31); #
+ &movdqa ($T3,$T1);
+ &pslldq ($T1,4);
+ &psrldq ($T3,12); #
+ &pslldq ($T2,4);
+ &por ($Xhi,$T3); #
+ &por ($Xi,$T1);
+ &por ($Xhi,$T2); #
+
+ # 1st phase
+ &movdqa ($T1,$Xi);
+ &movdqa ($T2,$Xi);
+ &movdqa ($T3,$Xi); #
+ &pslld ($T1,31);
+ &pslld ($T2,30);
+ &pslld ($Xi,25); #
+ &pxor ($T1,$T2);
+ &pxor ($T1,$Xi); #
+ &movdqa ($T2,$T1); #
+ &pslldq ($T1,12);
+ &psrldq ($T2,4); #
+ &pxor ($T3,$T1);
+
+ # 2nd phase
+ &pxor ($Xhi,$T3); #
+ &movdqa ($Xi,$T3);
+ &movdqa ($T1,$T3);
+ &psrld ($Xi,1); #
+ &psrld ($T1,2);
+ &psrld ($T3,7); #
+ &pxor ($Xi,$T1);
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+ &pxor ($Xi,$Xhi); #
+}
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($Xn,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$Xn);
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$Xn);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+ &jbe (&label("even_tail"));
+
+&set_label("mod_loop");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ #######
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp));
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+ &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+ &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+&set_label("rem_8bit",64);
+ &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+ &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+ &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+ &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+ &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+ &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+ &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+ &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+ &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+ &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+ &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+ &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+ &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+ &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+ &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+ &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+ &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+ &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+ &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+ &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+ &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+ &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+ &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+ &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+ &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+ &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+ &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+ &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+ &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+ &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+ &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+ &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}} # !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was raised about the choice of vanilla MMX. Or rather, why
+# wasn't SSE2 chosen instead? In addition to the fact that MMX runs on
+# legacy CPUs such as PIII, the "4-bit" MMX version was observed to
+# provide better performance than the *corresponding* SSE2 one even on
+# contemporary CPUs. SSE2 results were provided by Peter-Michael Hager.
+# He maintains an SSE2 implementation featuring a full range of
+# lookup-table sizes, but with
+# per-invocation lookup table setup. The latter means that the table size
+# is chosen depending on how much data is to be hashed in every given
+# call: more data, larger table. The best reported result for Core2 is
+# ~4 cycles per processed byte out of a 64KB block, a number that even
+# accounts for the 64KB table setup overhead. As discussed in gcm128.c,
+# we choose to be more conservative with respect to lookup table sizes,
+# but how do the results compare? The minimalistic "256B" MMX version
+# delivers ~11 cycles on the same platform. As also discussed in
+# gcm128.c, the next-in-line "8-bit Shoup's" or "4KB" method should
+# deliver twice the performance of the "256B" one, in other words no
+# worse than ~6 cycles per byte. It should also be noted that in the
+# SSE2 case the improvement can be "super-linear," i.e. more than
+# twofold, mostly because >>8 maps to a single instruction on an SSE2
+# register. This is unlike the "4-bit" case, where >>4 maps to the same
+# number of instructions in both the MMX and SSE2 cases. The bottom line
+# is that a switch to SSE2 is considered justifiable only if we choose
+# to implement the "8-bit" method...
diff --git a/main/openssl/crypto/modes/asm/ghash-x86_64.S b/main/openssl/crypto/modes/asm/ghash-x86_64.S
new file mode 100644
index 00000000..62d39c65
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86_64.S
@@ -0,0 +1,1026 @@
+.text
+
+.globl gcm_gmult_4bit
+.type gcm_gmult_4bit,@function
+.align 16
+gcm_gmult_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+.Lgmult_prologue:
+
+ movzbq 15(%rdi),%r8
+ leaq .Lrem_4bit(%rip),%r11
+ xorq %rax,%rax
+ xorq %rbx,%rbx
+ movb %r8b,%al
+ movb %r8b,%bl
+ shlb $4,%al
+ movq $14,%rcx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ movq %r8,%rdx
+ jmp .Loop1
+
+.align 16
+.Loop1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ movb (%rdi,%rcx,1),%al
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ movb %al,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ shlb $4,%al
+ xorq %r10,%r8
+ decq %rcx
+ js .Lbreak1
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+ jmp .Loop1
+
+.align 16
+.Lbreak1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ xorq %r10,%r8
+ xorq (%r11,%rdx,8),%r9
+
+ bswapq %r8
+ bswapq %r9
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ movq 16(%rsp),%rbx
+ leaq 24(%rsp),%rsp
+.Lgmult_epilogue:
+ .byte 0xf3,0xc3
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+.globl gcm_ghash_4bit
+.type gcm_ghash_4bit,@function
+.align 16
+gcm_ghash_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $280,%rsp
+.Lghash_prologue:
+ movq %rdx,%r14
+ movq %rcx,%r15
+ subq $-128,%rsi
+ leaq 16+128(%rsp),%rbp
+ xorl %edx,%edx
+ movq 0+0-128(%rsi),%r8
+ movq 0+8-128(%rsi),%rax
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq 16+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq 16+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,0(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,0(%rbp)
+ movq 32+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,0-128(%rbp)
+ movq 32+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,1(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,8(%rbp)
+ movq 48+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,8-128(%rbp)
+ movq 48+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,2(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,16(%rbp)
+ movq 64+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,16-128(%rbp)
+ movq 64+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,3(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,24(%rbp)
+ movq 80+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,24-128(%rbp)
+ movq 80+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,4(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,32(%rbp)
+ movq 96+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,32-128(%rbp)
+ movq 96+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,5(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,40(%rbp)
+ movq 112+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,40-128(%rbp)
+ movq 112+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,6(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,48(%rbp)
+ movq 128+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,48-128(%rbp)
+ movq 128+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,7(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,56(%rbp)
+ movq 144+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,56-128(%rbp)
+ movq 144+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,8(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,64(%rbp)
+ movq 160+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,64-128(%rbp)
+ movq 160+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,9(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,72(%rbp)
+ movq 176+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,72-128(%rbp)
+ movq 176+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,10(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,80(%rbp)
+ movq 192+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,80-128(%rbp)
+ movq 192+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,11(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,88(%rbp)
+ movq 208+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,88-128(%rbp)
+ movq 208+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,12(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,96(%rbp)
+ movq 224+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,96-128(%rbp)
+ movq 224+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,13(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,104(%rbp)
+ movq 240+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,104-128(%rbp)
+ movq 240+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,14(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,112(%rbp)
+ shlb $4,%dl
+ movq %rax,112-128(%rbp)
+ shlq $60,%r10
+ movb %dl,15(%rsp)
+ orq %r10,%rbx
+ movq %r9,120(%rbp)
+ movq %rbx,120-128(%rbp)
+ addq $-128,%rsi
+ movq 8(%rdi),%r8
+ movq 0(%rdi),%r9
+ addq %r14,%r15
+ leaq .Lrem_8bit(%rip),%r11
+ jmp .Louter_loop
+.align 16
+.Louter_loop:
+ xorq (%r14),%r9
+ movq 8(%r14),%rdx
+ leaq 16(%r14),%r14
+ xorq %r8,%rdx
+ movq %r9,(%rdi)
+ movq %rdx,8(%rdi)
+ shrq $32,%rdx
+ xorq %rax,%rax
+ roll $8,%edx
+ movb %dl,%al
+ movzbl %dl,%ebx
+ shlb $4,%al
+ shrl $4,%ebx
+ roll $8,%edx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ xorq %r8,%r12
+ movq %r9,%r10
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 8(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 0(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ andl $240,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl -4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ movzwq (%r11,%r12,2),%r12
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ shlq $48,%r12
+ xorq %r10,%r8
+ xorq %r12,%r9
+ movzbq %r8b,%r13
+ shrq $4,%r8
+ movq %r9,%r10
+ shlb $4,%r13b
+ shrq $4,%r9
+ xorq 8(%rsi,%rcx,1),%r8
+ movzwq (%r11,%r13,2),%r13
+ shlq $60,%r10
+ xorq (%rsi,%rcx,1),%r9
+ xorq %r10,%r8
+ shlq $48,%r13
+ bswapq %r8
+ xorq %r13,%r9
+ bswapq %r9
+ cmpq %r15,%r14
+ jb .Louter_loop
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ leaq 280(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lghash_epilogue:
+ .byte 0xf3,0xc3
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+.globl gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand .L0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ movdqu %xmm2,(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+.size gcm_init_clmul,.-gcm_init_clmul
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm5
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+.byte 102,15,56,0,197
+
+ subq $16,%rcx
+ jz .Lodd_tail
+
+ movdqu 16(%rsi),%xmm8
+
+
+
+
+
+ movdqu (%rdx),%xmm3
+ movdqu 16(%rdx),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+ movdqa %xmm6,%xmm7
+ pshufd $78,%xmm6,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm6,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,220,0
+ pxor %xmm6,%xmm3
+ pxor %xmm7,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm7
+ pxor %xmm4,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm8,%xmm4
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ jbe .Leven_tail
+
+.Lmod_loop:
+.byte 102,65,15,58,68,192,0
+.byte 102,65,15,58,68,200,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqu (%rdx),%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+
+ movdqu 16(%rdx),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+
+ movdqa %xmm6,%xmm7
+ pshufd $78,%xmm6,%xmm9
+ pshufd $78,%xmm2,%xmm10
+ pxor %xmm6,%xmm9
+ pxor %xmm2,%xmm10
+ pxor %xmm3,%xmm1
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+.byte 102,15,58,68,250,17
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+
+.byte 102,69,15,58,68,202,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm8,%xmm4
+
+ pxor %xmm6,%xmm9
+ pxor %xmm7,%xmm9
+ movdqa %xmm9,%xmm10
+ psrldq $8,%xmm9
+ pslldq $8,%xmm10
+ pxor %xmm9,%xmm7
+ pxor %xmm10,%xmm6
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ ja .Lmod_loop
+
+.Leven_tail:
+.byte 102,65,15,58,68,192,0
+.byte 102,65,15,58,68,200,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ testq %rcx,%rcx
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu (%rdx),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.Ldone:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.LSEH_end_gcm_ghash_clmul:
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/main/openssl/crypto/modes/asm/ghash-x86_64.pl b/main/openssl/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 00000000..38d779ed
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,806 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128).  "4-bit"
+# means that it uses a 256-byte per-key table [+128 bytes of shared
+# table].  The GHASH function features a so-called "528B" variant
+# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
+# of shared table].  Performance results are for this streamed GHASH
+# subroutine and are expressed in cycles per processed byte, less is
+# better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+#
+# (*)	the comparison is not completely fair, because the C results
+#	are for the vanilla "256B" implementation, while the assembler
+#	results are for "528B";-)
+# (**)	it's a mystery [to me] why the Core2 result is not the same
+#	as for Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
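For orientation, every routine generated by this module computes the same primitive: a multiplication in GF(2^128) reduced by the GCM polynomial x^128+x^7+x^2+x+1, in GCM's reflected bit order (bit 7 of byte 0 carries the coefficient of x^0, so multiplying by x is a right shift). A minimal bit-serial C sketch of that operation, for illustration only; ghash_mul_sketch is a hypothetical name, not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* Z = X * H in GF(2^128), GCM bit order; a carry out of the x^127
     * coefficient folds back as 0xE1 in byte 0. */
    static void ghash_mul_sketch(uint8_t Z[16],
                                 const uint8_t X[16], const uint8_t H[16])
    {
        uint8_t V[16], R[16] = {0};
        int i, j, k, carry;

        memcpy(V, H, 16);
        for (i = 0; i < 16; i++) {
            for (j = 7; j >= 0; j--) {
                if ((X[i] >> j) & 1)          /* coefficient of X set: R ^= V */
                    for (k = 0; k < 16; k++)
                        R[k] ^= V[k];
                carry = V[15] & 1;            /* V *= x: right shift across bytes */
                for (k = 15; k > 0; k--)
                    V[k] = (uint8_t)((V[k] >> 1) | (V[k-1] << 7));
                V[0] >>= 1;
                if (carry)
                    V[0] ^= 0xE1;             /* reduce mod x^128+x^7+x^2+x+1 */
            }
        }
        memcpy(Z, R, 16);
    }

The "4-bit" code below processes one nibble per step instead of one bit, paying for the speedup with the per-key table described above.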
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
+ $r =~ s/%[er]([sd]i)/%\1l/ or
+ $r =~ s/%[er](bp)/%\1l/ or
+ $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+{ my $N;
+ sub loop() {
+ my $inp = shift;
+
+ $N++;
+$code.=<<___;
+ xor $nlo,$nlo
+ xor $nhi,$nhi
+ mov `&LB("$Zlo")`,`&LB("$nlo")`
+ mov `&LB("$Zlo")`,`&LB("$nhi")`
+ shl \$4,`&LB("$nlo")`
+ mov \$14,$cnt
+ mov 8($Htbl,$nlo),$Zlo
+ mov ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ mov $Zlo,$rem
+ jmp .Loop$N
+
+.align 16
+.Loop$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ mov ($inp,$cnt),`&LB("$nlo")`
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ mov `&LB("$nlo")`,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ shl \$4,`&LB("$nlo")`
+ xor $tmp,$Zlo
+ dec $cnt
+ js .Lbreak$N
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+ jmp .Loop$N
+
+.align 16
+.Lbreak$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ xor $tmp,$Zlo
+ xor ($rem_4bit,$rem,8),$Zhi
+
+ bswap $Zlo
+ bswap $Zhi
+___
+}}
+
+$code=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.type gcm_gmult_4bit,\@function,2
+.align 16
+gcm_gmult_4bit:
+ push %rbx
+ push %rbp # %rbp and %r12 are pushed exclusively in
+ push %r12 # order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+ movzb 15($Xi),$Zlo
+ lea .Lrem_4bit(%rip),$rem_4bit
+___
+ &loop ($Xi);
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ mov 16(%rsp),%rbx
+ lea 24(%rsp),%rsp
+.Lgmult_epilogue:
+ ret
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.type gcm_ghash_4bit,\@function,4
+.align 16
+gcm_ghash_4bit:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$280,%rsp
+.Lghash_prologue:
+ mov $inp,%r14 # reassign couple of args
+ mov $len,%r15
+___
+{ my $inp="%r14";
+ my $dat="%edx";
+ my $len="%r15";
+ my @nhi=("%ebx","%ecx");
+ my @rem=("%r12","%r13");
+ my $Hshr4="%rbp";
+
+ &sub ($Htbl,-128); # size optimization
+ &lea ($Hshr4,"16+128(%rsp)");
+ { my @lo =($nlo,$nhi);
+ my @hi =($Zlo,$Zhi);
+
+ &xor ($dat,$dat);
+ for ($i=0,$j=-2;$i<18;$i++,$j++) {
+ &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
+ &or ($lo[0],$tmp) if ($i>1);
+ &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
+ &shr ($lo[1],4) if ($i>0 && $i<17);
+ &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
+ &shr ($hi[1],4) if ($i>0 && $i<17);
+ &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
+ &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
+ &shl (&LB($dat),4) if ($i>0 && $i<17);
+ &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
+ &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
+ &shl ($tmp,60) if ($i>0 && $i<17);
+
+ push (@lo,shift(@lo));
+ push (@hi,shift(@hi));
+ }
+ }
+ &add ($Htbl,-128);
+ &mov ($Zlo,"8($Xi)");
+ &mov ($Zhi,"0($Xi)");
+ &add ($len,$inp); # pointer to the end of data
+ &lea ($rem_8bit,".Lrem_8bit(%rip)");
+ &jmp (".Louter_loop");
+
+$code.=".align 16\n.Louter_loop:\n";
+ &xor ($Zhi,"($inp)");
+ &mov ("%rdx","8($inp)");
+ &lea ($inp,"16($inp)");
+ &xor ("%rdx",$Zlo);
+ &mov ("($Xi)",$Zhi);
+ &mov ("8($Xi)","%rdx");
+ &shr ("%rdx",32);
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &movz ($nhi[0],&LB($dat));
+ &shl (&LB($nlo),4);
+ &shr ($nhi[0],4);
+
+ for ($j=11,$i=0;$i<15;$i++) {
+ &rol ($dat,8);
+ &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
+ &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
+ &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
+ &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
+
+ &mov (&LB($nlo),&LB($dat));
+ &xor ($Zlo,$tmp) if ($i>0);
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
+
+ &movz ($nhi[1],&LB($dat));
+ &shl (&LB($nlo),4);
+ &movzb ($rem[0],"(%rsp,$nhi[0])");
+
+ &shr ($nhi[1],4) if ($i<14);
+ &and ($nhi[1],0xf0) if ($i==14);
+ &shl ($rem[1],48) if ($i>0);
+ &xor ($rem[0],$Zlo);
+
+ &mov ($tmp,$Zhi);
+ &xor ($Zhi,$rem[1]) if ($i>0);
+ &shr ($Zlo,8);
+
+ &movz ($rem[0],&LB($rem[0]));
+ &mov ($dat,"$j($Xi)") if (--$j%4==0);
+ &shr ($Zhi,8);
+
+ &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
+ &shl ($tmp,56);
+ &xor ($Zhi,"($Hshr4,$nhi[0],8)");
+
+ unshift (@nhi,pop(@nhi)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ }
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
+ &xor ($Zlo,"8($Htbl,$nlo)");
+ &xor ($Zhi,"($Htbl,$nlo)");
+
+ &shl ($rem[1],48);
+ &xor ($Zlo,$tmp);
+
+ &xor ($Zhi,$rem[1]);
+ &movz ($rem[0],&LB($Zlo));
+ &shr ($Zlo,4);
+
+ &mov ($tmp,$Zhi);
+ &shl (&LB($rem[0]),4);
+ &shr ($Zhi,4);
+
+ &xor ($Zlo,"8($Htbl,$nhi[0])");
+ &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
+ &shl ($tmp,60);
+
+ &xor ($Zhi,"($Htbl,$nhi[0])");
+ &xor ($Zlo,$tmp);
+ &shl ($rem[0],48);
+
+ &bswap ($Zlo);
+ &xor ($Zhi,$rem[0]);
+
+ &bswap ($Zhi);
+ &cmp ($inp,$len);
+ &jb (".Louter_loop");
+}
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ lea 280(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lghash_epilogue:
+ ret
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+sub clmul64x64_T2 { # minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey,$T2
+___
+$code.=<<___;
+ pclmulqdq \$0x00,$Hkey,$Xi #######
+ pclmulqdq \$0x11,$Hkey,$Xhi #######
+ pclmulqdq \$0x00,$T2,$T1 #######
+ pxor $Xi,$T1 #
+ pxor $Xhi,$T1 #
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+}
+
+sub reduction_alg9 { # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ # 1st phase
+ movdqa $Xi,$T1 #
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$5,$Xi #
+ pxor $T1,$Xi #
+ psllq \$57,$Xi #
+ movdqa $Xi,$T2 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ pxor $T2,$Xhi #
+
+ # 2nd phase
+ movdqa $Xi,$T2
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+ pxor $Xhi,$T2
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+___
+}
+
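Taken together, clmul64x64_T2 and reduction_alg9 implement a Karatsuba-style carry-less multiply followed by modular reduction. Writing X = X_h*x^64 + X_l and H = H_h*x^64 + H_l (addition being XOR), the three PCLMULQDQ products above combine as, in my notation:

    X \cdot H \;=\; X_h H_h\,x^{128} \;\oplus\; \bigl[(X_h \oplus X_l)(H_h \oplus H_l) \oplus X_h H_h \oplus X_l H_l\bigr]\,x^{64} \;\oplus\; X_l H_l

so the middle term costs one multiplication plus XORs instead of two multiplications; reduction_alg9 then folds the 256-bit product back to 128 bits modulo the GCM polynomial in its two shift/XOR phases.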
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl gcm_init_clmul
+.type gcm_init_clmul,\@abi-omnipotent
+.align 16
+gcm_init_clmul:
+ movdqu ($Xip),$Hkey
+ pshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ movdqa $Hkey,$T1
+ psllq \$1,$Hkey
+ pxor $T3,$T3 #
+ psrlq \$63,$T1
+ pcmpgtd $T2,$T3 # broadcast carry bit
+ pslldq \$8,$T1
+ por $T1,$Hkey # H<<=1
+
+ # magic reduction
+ pand .L0x1c2_polynomial(%rip),$T3
+ pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ movdqa $Hkey,$Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqu $Hkey,($Htbl) # save H
+ movdqu $Xi,16($Htbl) # save H^2
+ ret
+.size gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,\@abi-omnipotent
+.align 16
+gcm_gmult_clmul:
+ movdqu ($Xip),$Xi
+ movdqa .Lbswap_mask(%rip),$T3
+ movdqu ($Htbl),$Hkey
+ pshufb $T3,$Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+ ret
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+ my $Xn="%xmm6";
+ my $Xhn="%xmm7";
+ my $Hkey2="%xmm8";
+ my $T1n="%xmm9";
+ my $T2n="%xmm10";
+
+$code.=<<___;
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,\@abi-omnipotent
+.align 16
+gcm_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+___
+$code.=<<___;
+ movdqa .Lbswap_mask(%rip),$T3
+
+ movdqu ($Xip),$Xi
+ movdqu ($Htbl),$Hkey
+ pshufb $T3,$Xi
+
+ sub \$0x10,$len
+ jz .Lodd_tail
+
+ movdqu 16($Htbl),$Hkey2
+ #######
+	# Xi+2 = [H*(Ii+1 + Xi+1)] mod P
+	#      = [(H*Ii+1) + (H*Xi+1)] mod P
+	#      = [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ movdqu ($inp),$T1 # Ii
+ movdqu 16($inp),$Xn # Ii+1
+ pshufb $T3,$T1
+ pshufb $T3,$Xn
+ pxor $T1,$Xi # Ii+Xi
+___
+ &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey2,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey2,$T2
+
+ lea 32($inp),$inp # i+=2
+ sub \$0x20,$len
+ jbe .Leven_tail
+
+.Lmod_loop:
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
+$code.=<<___;
+ movdqu ($inp),$T1 # Ii
+ pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+
+ movdqu 16($inp),$Xn # Ii+1
+ pshufb $T3,$T1
+ pshufb $T3,$Xn
+
+ movdqa $Xn,$Xhn #
+ pshufd \$0b01001110,$Xn,$T1n
+ pshufd \$0b01001110,$Hkey,$T2n
+ pxor $Xn,$T1n #
+ pxor $Hkey,$T2n
+ pxor $T1,$Xhi # "Ii+Xi", consume early
+
+ movdqa $Xi,$T1 # 1st phase
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$5,$Xi #
+ pxor $T1,$Xi #
+ pclmulqdq \$0x00,$Hkey,$Xn #######
+ psllq \$57,$Xi #
+ movdqa $Xi,$T2 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ pxor $T2,$Xhi #
+
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+ pxor $Xhi,$T2
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+
+ pclmulqdq \$0x00,$T2n,$T1n #######
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey2,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey2,$T2
+
+ pxor $Xn,$T1n #
+ pxor $Xhn,$T1n #
+ movdqa $T1n,$T2n #
+ psrldq \$8,$T1n
+ pslldq \$8,$T2n #
+ pxor $T1n,$Xhn
+ pxor $T2n,$Xn #
+
+ lea 32($inp),$inp
+ sub \$0x20,$len
+ ja .Lmod_loop
+
+.Leven_tail:
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
+$code.=<<___;
+ pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+___
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ test $len,$len
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu ($inp),$T1 # Ii
+ pshufb $T3,$T1
+ pxor $T1,$Xi # Ii+Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ add \$0x58,%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_gcm_ghash_clmul:
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type .Lrem_4bit,\@object
+.Lrem_4bit:
+ .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+ .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+ .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+ .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type .Lrem_8bit,\@object
+.Lrem_8bit:
+ .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+ .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+ .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+ .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+ .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+ .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+ .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+ .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+ .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+ .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+ .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+ .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+ .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+ .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+ .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+ .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+ .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+ .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+ .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+ .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+ .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+ .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+ .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+ .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+ .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+ .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+ .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+ .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+ .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+ .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+ .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+ .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 24(%rax),%rax # adjust "rsp"
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_gcm_gmult_4bit
+ .rva .LSEH_end_gcm_gmult_4bit
+ .rva .LSEH_info_gcm_gmult_4bit
+
+ .rva .LSEH_begin_gcm_ghash_4bit
+ .rva .LSEH_end_gcm_ghash_4bit
+ .rva .LSEH_info_gcm_ghash_4bit
+
+ .rva .LSEH_begin_gcm_ghash_clmul
+ .rva .LSEH_end_gcm_ghash_clmul
+ .rva .LSEH_info_gcm_ghash_clmul
+
+.section .xdata
+.align 8
+.LSEH_info_gcm_gmult_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
+.LSEH_info_gcm_ghash_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_ghash_clmul:
+ .byte 0x01,0x1f,0x0b,0x00
+ .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/modes/cbc128.c b/main/openssl/crypto/modes/cbc128.c
index 8f8bd563..0e54f754 100644
--- a/main/openssl/crypto/modes/cbc128.c
+++ b/main/openssl/crypto/modes/cbc128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,12 +59,7 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
+#ifndef STRICT_ALIGNMENT
# define STRICT_ALIGNMENT 0
#endif
@@ -121,7 +117,7 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
unsigned char ivec[16], block128_f block)
{
size_t n;
- union { size_t align; unsigned char c[16]; } tmp;
+ union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
assert(in && out && key && ivec);
@@ -141,11 +137,13 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
out += 16;
}
}
- else {
+ else if (16%sizeof(size_t) == 0) { /* always true */
while (len>=16) {
+ size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
+
(*block)(in, out, key);
- for(n=0; n<16; n+=sizeof(size_t))
- *(size_t *)(out+n) ^= *(size_t *)(iv+n);
+ for(n=0; n<16/sizeof(size_t); n++)
+ out_t[n] ^= iv_t[n];
iv = in;
len -= 16;
in += 16;
@@ -169,15 +167,16 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
out += 16;
}
}
- else {
- size_t c;
+ else if (16%sizeof(size_t) == 0) { /* always true */
while (len>=16) {
+ size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
+ const size_t *in_t=(const size_t *)in;
+
(*block)(in, tmp.c, key);
- for(n=0; n<16; n+=sizeof(size_t)) {
- c = *(size_t *)(in+n);
- *(size_t *)(out+n) =
- *(size_t *)(tmp.c+n) ^ *(size_t *)(ivec+n);
- *(size_t *)(ivec+n) = c;
+ for(n=0; n<16/sizeof(size_t); n++) {
+ c = in_t[n];
+ out_t[n] = tmp.t[n] ^ ivec_t[n];
+ ivec_t[n] = c;
}
len -= 16;
in += 16;
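The hunk above replaces per-byte pointer casts with a union whose size_t lanes alias the 16-byte block, so the XOR runs word-at-a-time through properly typed, aligned lanes. A standalone sketch of the same trick, assuming nothing beyond the union declared in the patch:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } a, b;
        size_t n;

        memset(a.c, 0xAA, 16);
        memset(b.c, 0x0F, 16);
        for (n = 0; n < 16/sizeof(size_t); n++)   /* word-at-a-time XOR */
            a.t[n] ^= b.t[n];
        printf("%02x\n", a.c[0]);                 /* prints a5 (0xAA ^ 0x0F) */
        return 0;
    }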
diff --git a/main/openssl/crypto/modes/ccm128.c b/main/openssl/crypto/modes/ccm128.c
new file mode 100644
index 00000000..3ce11d0d
--- /dev/null
+++ b/main/openssl/crypto/modes/ccm128.c
@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+/* First you set up the M and L parameters and pass the key schedule.
+ * This is called once per session setup... */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+ unsigned int M,unsigned int L,void *key,block128_f block)
+{
+ memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
+ ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
+ ctx->blocks = 0;
+ ctx->block = block;
+ ctx->key = key;
+}
+
+/* !!! The following interfaces are to be called *once* per packet !!! */
+
+/* Then you set up the per-message nonce and pass the length of the message */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+ const unsigned char *nonce,size_t nlen,size_t mlen)
+{
+ unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
+
+ if (nlen<(14-L)) return -1; /* nonce is too short */
+
+ if (sizeof(mlen)==8 && L>=3) {
+ ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
+ ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
+ ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
+ ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
+ }
+ else
+ ctx->nonce.u[1] = 0;
+
+ ctx->nonce.c[12] = (u8)(mlen>>24);
+ ctx->nonce.c[13] = (u8)(mlen>>16);
+ ctx->nonce.c[14] = (u8)(mlen>>8);
+ ctx->nonce.c[15] = (u8)mlen;
+
+ ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
+ memcpy(&ctx->nonce.c[1],nonce,14-L);
+
+ return 0;
+}
+
+/* Then you pass the additional authentication data; this step is optional */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+ const unsigned char *aad,size_t alen)
+{ unsigned int i;
+ block128_f block = ctx->block;
+
+ if (alen==0) return;
+
+ ctx->nonce.c[0] |= 0x40; /* set Adata flag */
+ (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
+ ctx->blocks++;
+
+ if (alen<(0x10000-0x100)) {
+ ctx->cmac.c[0] ^= (u8)(alen>>8);
+ ctx->cmac.c[1] ^= (u8)alen;
+ i=2;
+ }
+ else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFF;
+ ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
+ ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
+ ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
+ ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
+ ctx->cmac.c[6] ^= (u8)(alen>>24);
+ ctx->cmac.c[7] ^= (u8)(alen>>16);
+ ctx->cmac.c[8] ^= (u8)(alen>>8);
+ ctx->cmac.c[9] ^= (u8)alen;
+ i=10;
+ }
+ else {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFE;
+ ctx->cmac.c[2] ^= (u8)(alen>>24);
+ ctx->cmac.c[3] ^= (u8)(alen>>16);
+ ctx->cmac.c[4] ^= (u8)(alen>>8);
+ ctx->cmac.c[5] ^= (u8)alen;
+ i=6;
+ }
+
+ do {
+ for(;i<16 && alen;++i,++aad,--alen)
+ ctx->cmac.c[i] ^= *aad;
+ (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
+ ctx->blocks++;
+ i=0;
+ } while (alen);
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/* The counter part of the nonce may not be larger than L*8 bits;
+ * L is not larger than 8, therefore a 64-bit counter suffices... */
+static void ctr64_inc(unsigned char *counter) {
+ unsigned int n=8;
+ u8 c;
+
+ counter += 8;
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key),
+ ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1; /* length mismatch */
+
+ ctx->blocks += ((len+15)>>3)|1;
+ if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ union { u64 u[2]; u8 c[16]; } temp;
+
+ memcpy (temp.c,inp,16);
+ ctx->cmac.u[0] ^= temp.u[0];
+ ctx->cmac.u[1] ^= temp.u[1];
+#else
+ ctx->cmac.u[0] ^= ((u64*)inp)[0];
+ ctx->cmac.u[1] ^= ((u64*)inp)[1];
+#endif
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ temp.u[0] ^= scratch.u[0];
+ temp.u[1] ^= scratch.u[1];
+ memcpy(out,temp.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
+ ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1;
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ union { u64 u[2]; u8 c[16]; } temp;
+#endif
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ memcpy (temp.c,inp,16);
+ ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+ ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+ memcpy (out,scratch.c,16);
+#else
+ ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
+ ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+#endif
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+static void ctr64_add (unsigned char *counter,size_t inc)
+{ size_t n=8, val=0;
+
+ counter += 8;
+ do {
+ --n;
+ val += counter[n] + (inc&0xff);
+ counter[n] = (unsigned char)val;
+ val >>= 8; /* carry bit */
+ inc >>= 8;
+ } while(n && (inc || val));
+}
+
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len,ccm128_f stream)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key),
+ ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1; /* length mismatch */
+
+ ctx->blocks += ((len+15)>>3)|1;
+ if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
+
+ if ((n=len/16)) {
+ (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len) ctr64_add(ctx->nonce.c,n/16);
+ }
+
+ if (len) {
+ for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len,ccm128_f stream)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1;
+
+ if ((n=len/16)) {
+ (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len) ctr64_add(ctx->nonce.c,n/16);
+ }
+
+ if (len) {
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
+{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
+
+ M *= 2; M += 2;
+ if (len<M) return 0;
+ memcpy(tag,ctx->cmac.c,M);
+ return M;
+}
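The call discipline spelled out in the comments above (init once per key; then, per packet, setiv, optional aad, encrypt or decrypt, tag) composes with any block128_f. A hedged usage sketch with AES as the block cipher; ccm_seal_sketch and its parameters are hypothetical, and the nonce length must match the chosen L as enforced by CRYPTO_ccm128_setiv:

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    static int ccm_seal_sketch(const unsigned char key[16],
                               const unsigned char *nonce, size_t nlen,
                               const unsigned char *aad, size_t alen,
                               const unsigned char *in, unsigned char *out,
                               size_t len, unsigned char tag[16])
    {
        AES_KEY ks;
        CCM128_CONTEXT ccm;

        AES_set_encrypt_key(key, 128, &ks);
        /* M = 16-byte tag, L = 8-byte length field; once per key */
        CRYPTO_ccm128_init(&ccm, 16, 8, &ks, (block128_f)AES_encrypt);

        if (CRYPTO_ccm128_setiv(&ccm, nonce, nlen, len))
            return -1;                    /* nonce too short */
        if (alen)
            CRYPTO_ccm128_aad(&ccm, aad, alen);
        if (CRYPTO_ccm128_encrypt(&ccm, in, out, len))
            return -1;                    /* length mismatch or too much data */
        return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
    }

The (block128_f)AES_encrypt cast mirrors how OpenSSL's EVP glue feeds raw block functions to these mode helpers.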
diff --git a/main/openssl/crypto/modes/cfb128.c b/main/openssl/crypto/modes/cfb128.c
index e5938c61..4e6f5d35 100644
--- a/main/openssl/crypto/modes/cfb128.c
+++ b/main/openssl/crypto/modes/cfb128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output are encrypted as though 128-bit CFB mode is
  * being used.  The extra state information to record how much of the
  * 128-bit block we have used is contained in *num;
diff --git a/main/openssl/crypto/modes/ctr128.c b/main/openssl/crypto/modes/ctr128.c
index 932037f5..ee642c58 100644
--- a/main/openssl/crypto/modes/ctr128.c
+++ b/main/openssl/crypto/modes/ctr128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,17 +59,6 @@
#endif
#include <assert.h>
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
/* NOTE: the IV/counter CTR mode is big-endian. The code itself
* is endian-neutral. */
@@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
*num=n;
}
+
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(unsigned char *counter) {
+ u32 n=12;
+ u8 c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], unsigned char ecount_buf[16],
+ unsigned int *num, ctr128_f func)
+{
+ unsigned int n,ctr32;
+
+ assert(in && out && key && ecount_buf && num);
+ assert(*num < 16);
+
+ n = *num;
+
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n+1) % 16;
+ }
+
+ ctr32 = GETU32(ivec+12);
+ while (len>=16) {
+ size_t blocks = len/16;
+		/*
+		 * 1<<28 is just a not-so-small yet not-so-large number...
+		 * The condition below is practically never met, but it
+		 * has to be checked for code correctness.
+		 */
+ if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
+ blocks = (1U<<28);
+		/*
+		 * As (*func) operates on a 32-bit counter, the caller
+		 * has to handle overflow.  The 'if' below detects the
+		 * overflow, which is then handled by limiting the
+		 * number of blocks to the exact overflow point...
+		 */
+ ctr32 += (u32)blocks;
+ if (ctr32 < blocks) {
+ blocks -= ctr32;
+ ctr32 = 0;
+ }
+ (*func)(in,out,blocks,key,ivec);
+		/* (*func) does not update ivec, the caller does: */
+ PUTU32(ivec+12,ctr32);
+		/* ... overflow was detected, propagate carry. */
+ if (ctr32 == 0) ctr96_inc(ivec);
+ blocks *= 16;
+ len -= blocks;
+ out += blocks;
+ in += blocks;
+ }
+ if (len) {
+ memset(ecount_buf,0,16);
+ (*func)(ecount_buf,ecount_buf,1,key,ivec);
+ ++ctr32;
+ PUTU32(ivec+12,ctr32);
+ if (ctr32 == 0) ctr96_inc(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+
+ *num=n;
+}
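The overflow clamp in CRYPTO_ctr128_encrypt_ctr32 above is easiest to verify with concrete numbers; a standalone sketch of just that arithmetic, with no OpenSSL calls:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t ctr32  = 0xfffffffeU;   /* low 32 bits of the counter */
        size_t   blocks = 5;             /* blocks the caller asked for */

        ctr32 += (uint32_t)blocks;       /* wraps: 0xfffffffe + 5 -> 3 */
        if (ctr32 < blocks) {            /* wrap detected */
            blocks -= ctr32;             /* only 5 - 3 = 2 blocks this pass */
            ctr32 = 0;                   /* upper 96 bits need a carry */
        }
        printf("%u %lu\n", (unsigned)ctr32, (unsigned long)blocks);  /* 0 2 */
        return 0;
    }

After those two blocks the low 32 bits have wrapped to exactly zero, which is why ctr32 == 0 triggers ctr96_inc in the loop above.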
diff --git a/main/openssl/crypto/modes/gcm128.c b/main/openssl/crypto/modes/gcm128.c
new file mode 100644
index 00000000..250063de
--- /dev/null
+++ b/main/openssl/crypto/modes/gcm128.c
@@ -0,0 +1,1817 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#define OPENSSL_FIPSAPI
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
+/* redefine, because alignment is ensured */
+#undef GETU32
+#define GETU32(p) BSWAP4(*(const u32 *)(p))
+#undef PUTU32
+#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V) do { \
+ if (sizeof(size_t)==8) { \
+ u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^T; \
+ } \
+ else { \
+ u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^((u64)T<<32); \
+ } \
+} while(0)
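For reference, REDUCE1BIT is the single-coefficient step of the reflected-bit GHASH arithmetic: shift the 128-bit V right by one and fold the bit shifted out back in through the top byte. As a sketch of the math (my notation):

    V \;\leftarrow\; (V \gg 1) \;\oplus\; (V \wedge 1)\cdot(\mathtt{0xe1} \ll 120)

which computes V*x modulo x^128 + x^7 + x^2 + x + 1 in GCM's bit order; the 0xe100000000000000 and 0xe1000000 constants above are that same fold on the 64-bit and 32-bit code paths respectively.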
+
+/*
+ * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
+ * should never be set to 8: 8 is effectively reserved for testing
+ * purposes.  TABLE_BITS>1 selects the lookup-table-driven
+ * implementations referred to as "Shoup's" in the GCM specification;
+ * in other words, OpenSSL does not cover the whole spectrum of possible
+ * table-driven implementations.  Why?  In the non-"Shoup's" case the
+ * memory access pattern is segmented in such a manner that it is
+ * trivial to see that cache-timing information can reveal a fair
+ * portion of the intermediate hash value.  Given that the ciphertext is
+ * always available to an attacker, it is possible to attempt to deduce
+ * the secret parameter H and, if successful, to tamper with messages
+ * [which is nothing but trivial in CTR mode].  In the "Shoup's" case it
+ * is not as trivial, but there is no reason to believe that it is
+ * resistant to cache-timing attacks either.  As for the "8-bit"
+ * implementation: it consumes 16 (sixteen) times more memory, 4KB per
+ * individual key + 1KB shared.  On the pro side, it should be twice as
+ * fast as the "4-bit" version, and for gcc-generated x86[_64] code the
+ * "8-bit" version was observed to run ~75% faster, closer to 100% for
+ * commercial compilers...  Yet the "4-bit" procedure is preferred,
+ * because it is believed to provide a better security-performance
+ * balance and adequate all-round performance.  "All-round" refers to
+ * things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example, on Windows a large enough free
+ *   results in VM working-set trimming, meaning that a subsequent
+ *   malloc would immediately incur working-set expansion);
+ * - a larger table has a larger cache footprint, which can affect the
+ *   performance of other code paths (not necessarily even from the
+ *   same thread in a Hyper-Threading world);
+ *
+ * A value of 1 is not appropriate for performance reasons.
+ */
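As a quick arithmetic check on the sizes quoted above, with u128 laid out as two 64-bit halves (as it appears to be defined in modes_lcl.h), the per-key tables come out as claimed; a throwaway sketch with a stand-in type:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t hi, lo; } u128_sketch;  /* stand-in for u128 */

    int main(void)
    {
        /* TABLE_BITS==4: 16 entries;  TABLE_BITS==8: 256 entries */
        printf("%u bytes/key\n", (unsigned)(16  * sizeof(u128_sketch)));  /* 256  */
        printf("%u bytes/key\n", (unsigned)(256 * sizeof(u128_sketch)));  /* 4096 */
        return 0;
    }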
+#if TABLE_BITS==8
+
+static void gcm_init_8bit(u128 Htable[256], u64 H[2])
+{
+ int i, j;
+ u128 V;
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+ for (Htable[128]=V, i=64; i>0; i>>=1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i=2; i<256; i<<=1) {
+ u128 *Hi = Htable+i, H0 = *Hi;
+ for (j=1; j<i; ++j) {
+ Hi[j].hi = H0.hi^Htable[j].hi;
+ Hi[j].lo = H0.lo^Htable[j].lo;
+ }
+ }
+}
+
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
+{
+ u128 Z = { 0, 0};
+ const u8 *xi = (const u8 *)Xi+15;
+ size_t rem, n = *xi;
+ const union { long one; char little; } is_endian = {1};
+ static const size_t rem_8bit[256] = {
+ PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+ PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+ PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+ PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+ PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+ PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+ PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+ PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+ PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+ PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+ PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+ PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+ PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+ PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+ PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+ PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+ PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+ PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+ PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+ PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+ PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+ PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+ PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+ PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+ PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+ PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+ PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+ PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+ PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+ PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+ PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+ PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+ PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+ PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+ PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+ PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+ PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+ PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+ PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+ PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+ PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+ PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+ PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+ PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+ PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+ PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+ PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+ PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+ PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+ PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+ PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+ PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+ PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+ PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+ PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+ PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+ PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+ PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+ PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+ PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+ PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+ PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+ PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+ PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
+
+ while (1) {
+ Z.hi ^= Htable[n].hi;
+ Z.lo ^= Htable[n].lo;
+
+ if ((u8 *)Xi==xi) break;
+
+ n = *(--xi);
+
+ rem = (size_t)Z.lo&0xff;
+ Z.lo = (Z.hi<<56)|(Z.lo>>8);
+ Z.hi = (Z.hi>>8);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_8bit[rem];
+ else
+ Z.hi ^= (u64)rem_8bit[rem]<<32;
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif TABLE_BITS==4
+
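+/*
+ * Populate the 16-entry table analogously to the 8-bit case:
+ * Htable[i] = i*H (with i read as a bit-reversed 4-bit polynomial),
+ * powers of two via REDUCE1BIT and the rest by XOR-combining.
+ */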
+static void gcm_init_4bit(u128 Htable[16], u64 H[2])
+{
+ u128 V;
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ int i;
+#endif
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ for (Htable[8]=V, i=4; i>0; i>>=1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i=2; i<16; i<<=1) {
+ u128 *Hi = Htable+i;
+ int j;
+ for (V=*Hi, j=1; j<i; ++j) {
+ Hi[j].hi = V.hi^Htable[j].hi;
+ Hi[j].lo = V.lo^Htable[j].lo;
+ }
+ }
+#else
+ Htable[8] = V;
+ REDUCE1BIT(V);
+ Htable[4] = V;
+ REDUCE1BIT(V);
+ Htable[2] = V;
+ REDUCE1BIT(V);
+ Htable[1] = V;
+ Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
+ V=Htable[4];
+ Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
+ Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
+ Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
+ V=Htable[8];
+ Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
+ Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
+ Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
+ Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
+ Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
+ Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
+ Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
+#endif
+#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+ /*
+ * ARM assembler expects specific dword order in Htable.
+ */
+ {
+ int j;
+ const union { long one; char little; } is_endian = {1};
+
+ if (is_endian.little)
+ for (j=0;j<16;++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo;
+ Htable[j].lo = V.hi;
+ }
+ else
+ for (j=0;j<16;++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo<<32|V.lo>>32;
+ Htable[j].lo = V.hi<<32|V.hi>>32;
+ }
+ }
+#endif
+}
+
+#ifndef GHASH_ASM
+static const size_t rem_4bit[16] = {
+ PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+ PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+ PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+ PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
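+
+/*
+ * rem_4bit[r] is the pre-reduced contribution of the 4-bit value r
+ * shifted out of Z during a nibble step; PACK() pre-positions the
+ * 16-bit constant in the top bits of a size_t so that the XOR below
+ * lands in the top of Z.hi.
+ */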
+
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
+{
+ u128 Z;
+ int cnt = 15;
+ size_t rem, nlo, nhi;
+ const union { long one; char little; } is_endian = {1};
+
+ nlo = ((const u8 *)Xi)[15];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt<0) break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+/*
+ * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
+ * for details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here mostly
+ * as a reference and a placeholder for possible future non-trivial
+ * optimization[s]...
+ */
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len)
+{
+ u128 Z;
+ int cnt;
+ size_t rem, nlo, nhi;
+ const union { long one; char little; } is_endian = {1};
+
+#if 1
+ do {
+ cnt = 15;
+ nlo = ((const u8 *)Xi)[15];
+ nlo ^= inp[15];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt<0) break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+#else
+	/*
+	 * An extra 256+16 bytes per key plus 512 bytes of shared tables
+	 * [should] give ~50% improvement... One could have PACK()-ed
+	 * rem_8bit even here, but the priority is to minimize the
+	 * cache footprint...
+	 */
+ u128 Hshr4[16]; /* Htable shifted right by 4 bits */
+ u8 Hshl4[16]; /* Htable shifted left by 4 bits */
+ static const unsigned short rem_8bit[256] = {
+ 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+ 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+ 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+ 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+ 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+ 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+ 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+ 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+ 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+ 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+ 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+ 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+ 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+ 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+ 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+ 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+ 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+ 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+ 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+ 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+ 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+ 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+ 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+ 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+ 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+ 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+ 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+ 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+ 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+ 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+ 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+ 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+	/*
+	 * This pre-processing phase slows the procedure down by
+	 * approximately as much time as it saves on each loop iteration.
+	 * In other words, single-block performance is approximately the
+	 * same as for the straightforward "4-bit" implementation, and
+	 * beyond one block it only gets faster...
+	 */
+ for (cnt=0; cnt<16; ++cnt) {
+ Z.hi = Htable[cnt].hi;
+ Z.lo = Htable[cnt].lo;
+ Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
+ Hshr4[cnt].hi = (Z.hi>>4);
+ Hshl4[cnt] = (u8)(Z.lo<<4);
+ }
+
+ do {
+ for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo&0xff;
+
+ Z.lo = (Z.hi<<56)|(Z.lo>>8);
+ Z.hi = (Z.hi>>8);
+
+ Z.hi ^= Hshr4[nhi].hi;
+ Z.lo ^= Hshr4[nhi].lo;
+ Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
+ }
+
+ nlo = ((const u8 *)Xi)[0];
+ nlo ^= inp[0];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo&0xf;
+
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+ Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
+#endif
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+ } while (inp+=16, len-=16);
+}
+#endif
+#else
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
+#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/* GHASH_CHUNK is a "stride" parameter intended to mitigate cache-
+ * thrashing effects. In other words, the idea is to hash data while
+ * it's still in the L1 cache after the encryption pass... */
+#define GHASH_CHUNK (3*1024)
+#endif
+
+#else /* TABLE_BITS */
+
+static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+{
+ u128 V,Z = { 0,0 };
+ long X;
+ int i,j;
+ const long *xi = (const long *)Xi;
+ const union { long one; char little; } is_endian = {1};
+
+ V.hi = H[0]; /* H is in host byte order, no byte swapping */
+ V.lo = H[1];
+
+ for (j=0; j<16/sizeof(long); ++j) {
+ if (is_endian.little) {
+ if (sizeof(long)==8) {
+#ifdef BSWAP8
+ X = (long)(BSWAP8(xi[j]));
+#else
+ const u8 *p = (const u8 *)(xi+j);
+ X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
+#endif
+ }
+ else {
+ const u8 *p = (const u8 *)(xi+j);
+ X = (long)GETU32(p);
+ }
+ }
+ else
+ X = xi[j];
+
+ for (i=0; i<8*sizeof(long); ++i, X<<=1) {
+ u64 M = (u64)(X>>(8*sizeof(long)-1));
+ Z.hi ^= V.hi&M;
+ Z.lo ^= V.lo&M;
+
+ REDUCE1BIT(V);
+ }
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+#endif
+
+#if TABLE_BITS==4 && defined(GHASH_ASM)
+# if !defined(I386_ONLY) && \
+ (defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+# define GHASH_ASM_X86_OR_64
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+# endif
+# elif defined(__arm__) || defined(__arm)
+# include "arm_arch.h"
+# if __ARM_ARCH__>=7
+# define GHASH_ASM_ARM
+# define GCM_FUNCREF_4BIT
+void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+# endif
+# endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+# undef GCM_MUL
+# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+# undef GHASH
+# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
+#endif
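+
+/* With GCM_FUNCREF_4BIT the gmult/ghash implementations are selected
+ * at run time in CRYPTO_gcm128_init and reached through local function
+ * pointers that each caller loads from the context. */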
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+{
+ const union { long one; char little; } is_endian = {1};
+
+ memset(ctx,0,sizeof(*ctx));
+ ctx->block = block;
+ ctx->key = key;
+
+ (*block)(ctx->H.c,ctx->H.c,key);
+
+ if (is_endian.little) {
+ /* H is stored in host byte order */
+#ifdef BSWAP8
+ ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+ ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+ u8 *p = ctx->H.c;
+ u64 hi,lo;
+ hi = (u64)GETU32(p) <<32|GETU32(p+4);
+ lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
+ ctx->H.u[0] = hi;
+ ctx->H.u[1] = lo;
+#endif
+ }
+
+#if TABLE_BITS==8
+ gcm_init_8bit(ctx->Htable,ctx->H.u);
+#elif TABLE_BITS==4
+# if defined(GHASH_ASM_X86_OR_64)
+# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
+ OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
+ gcm_init_clmul(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_clmul;
+ ctx->ghash = gcm_ghash_clmul;
+ return;
+ }
+# endif
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+# if defined(GHASH_ASM_X86) /* x86 only */
+# if defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
+# else
+ if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
+# endif
+ ctx->gmult = gcm_gmult_4bit_mmx;
+ ctx->ghash = gcm_ghash_4bit_mmx;
+ } else {
+ ctx->gmult = gcm_gmult_4bit_x86;
+ ctx->ghash = gcm_ghash_4bit_x86;
+ }
+# else
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+# endif
+# elif defined(GHASH_ASM_ARM)
+ if (OPENSSL_armcap_P & ARMV7_NEON) {
+ ctx->gmult = gcm_gmult_neon;
+ ctx->ghash = gcm_ghash_neon;
+ } else {
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
+# else
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+# endif
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ ctx->Yi.u[0] = 0;
+ ctx->Yi.u[1] = 0;
+ ctx->Xi.u[0] = 0;
+ ctx->Xi.u[1] = 0;
+ ctx->len.u[0] = 0; /* AAD length */
+ ctx->len.u[1] = 0; /* message length */
+ ctx->ares = 0;
+ ctx->mres = 0;
+
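+	/* Per the GCM specification, a 96-bit IV is used directly as Yi
+	 * with the 32-bit counter set to 1; any other IV length is
+	 * GHASH-ed into Yi together with its bit length. */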
+ if (len==12) {
+ memcpy(ctx->Yi.c,iv,12);
+ ctx->Yi.c[15]=1;
+ ctr=1;
+ }
+ else {
+ size_t i;
+ u64 len0 = len;
+
+ while (len>=16) {
+ for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx,Yi);
+ iv += 16;
+ len -= 16;
+ }
+ if (len) {
+ for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx,Yi);
+ }
+ len0 <<= 3;
+ if (is_endian.little) {
+#ifdef BSWAP8
+ ctx->Yi.u[1] ^= BSWAP8(len0);
+#else
+ ctx->Yi.c[8] ^= (u8)(len0>>56);
+ ctx->Yi.c[9] ^= (u8)(len0>>48);
+ ctx->Yi.c[10] ^= (u8)(len0>>40);
+ ctx->Yi.c[11] ^= (u8)(len0>>32);
+ ctx->Yi.c[12] ^= (u8)(len0>>24);
+ ctx->Yi.c[13] ^= (u8)(len0>>16);
+ ctx->Yi.c[14] ^= (u8)(len0>>8);
+ ctx->Yi.c[15] ^= (u8)(len0);
+#endif
+ }
+ else
+ ctx->Yi.u[1] ^= len0;
+
+ GCM_MUL(ctx,Yi);
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+ }
+
+ (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+}
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+{
+ size_t i;
+ unsigned int n;
+ u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ if (ctx->len.u[1]) return -2;
+
+ alen += len;
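+	/* GCM limits AAD to 2^64 bits, i.e. 2^61 bytes; the second test
+	 * below catches wrap-around of the 64-bit byte counter */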
+ if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
+ return -1;
+ ctx->len.u[0] = alen;
+
+ n = ctx->ares;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(aad++);
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->ares = n;
+ return 0;
+ }
+ }
+
+#ifdef GHASH
+ if ((i = (len&(size_t)-16))) {
+ GHASH(ctx,aad,i);
+ aad += i;
+ len -= i;
+ }
+#else
+ while (len>=16) {
+ for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
+ GCM_MUL(ctx,Xi);
+ aad += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ n = (unsigned int)len;
+ for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
+ }
+
+ ctx->ares = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+#if 0
+ n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+#endif
+ mlen += len;
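+	/* GCM's per-invocation plaintext limit is 2^39-256 bits, i.e.
+	 * 2^36-32 bytes; the second test catches 64-bit wrap-around */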
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16%sizeof(size_t) == 0) do { /* always true actually */
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(STRICT_ALIGNMENT)
+ if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+ break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len>=GHASH_CHUNK) {
+ size_t j=GHASH_CHUNK;
+
+ while (j) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i;
+
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ GHASH(ctx,out-j,j);
+ }
+#else
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ ctx->Xi.t[i] ^=
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while(0);
+#endif
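+	/* bytewise fallback: used by OPENSSL_SMALL_FOOTPRINT builds and
+	 * when in/out are not word-aligned on strict-alignment targets */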
+ for (i=0;i<len;++i) {
+ if (n==0) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
+ n = (n+1)%16;
+ if (n==0)
+ GCM_MUL(ctx,Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16%sizeof(size_t) == 0) do { /* always true actually */
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL (ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(STRICT_ALIGNMENT)
+ if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+ break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len>=GHASH_CHUNK) {
+ size_t j=GHASH_CHUNK;
+
+ GHASH(ctx,in,GHASH_CHUNK);
+ while (j) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len&(size_t)-16))) {
+ GHASH(ctx,in,i);
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ }
+#else
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+			for (i=0; i<16/sizeof(size_t); ++i) {
+				size_t c = in_t[i];
+				out_t[i] = c^ctx->EKi.t[i];
+				ctx->Xi.t[i] ^= c;
+			}
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while(0);
+#endif
+ for (i=0;i<len;++i) {
+ u8 c;
+ if (n==0) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ c = in[i];
+ out[i] = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ n = (n+1)%16;
+ if (n==0)
+ GCM_MUL(ctx,Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ while (len>=GHASH_CHUNK) {
+ (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+ ctr += GHASH_CHUNK/16;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ GHASH(ctx,out,GHASH_CHUNK);
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+#endif
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i/16;
+
+ (*stream)(in,out,j,key,ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ in += i;
+ len -= i;
+#if defined(GHASH)
+ GHASH(ctx,out,i);
+ out += i;
+#else
+ while (j--) {
+ for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ }
+#endif
+ }
+ if (len) {
+ (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len,ctr128_f stream)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL (ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ while (len>=GHASH_CHUNK) {
+ GHASH(ctx,in,GHASH_CHUNK);
+ (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+ ctr += GHASH_CHUNK/16;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+#endif
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i/16;
+
+#if defined(GHASH)
+ GHASH(ctx,in,i);
+#else
+ while (j--) {
+ size_t k;
+ for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
+ GCM_MUL(ctx,Xi);
+ in += 16;
+ }
+ j = i/16;
+ in -= i;
+#endif
+ (*stream)(in,out,j,key,ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ out += i;
+ in += i;
+ len -= i;
+ }
+ if (len) {
+ (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ u64 alen = ctx->len.u[0]<<3;
+ u64 clen = ctx->len.u[1]<<3;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ if (ctx->mres || ctx->ares)
+ GCM_MUL(ctx,Xi);
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ alen = BSWAP8(alen);
+ clen = BSWAP8(clen);
+#else
+ u8 *p = ctx->len.c;
+
+ ctx->len.u[0] = alen;
+ ctx->len.u[1] = clen;
+
+ alen = (u64)GETU32(p) <<32|GETU32(p+4);
+ clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+#endif
+ }
+
+ ctx->Xi.u[0] ^= alen;
+ ctx->Xi.u[1] ^= clen;
+ GCM_MUL(ctx,Xi);
+
+ ctx->Xi.u[0] ^= ctx->EK0.u[0];
+ ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
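+	/* note: memcmp below is not a constant-time comparison; callers
+	 * comparing secret tags may prefer a constant-time alternative */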
+ if (tag && len<=sizeof(ctx->Xi))
+ return memcmp(ctx->Xi.c,tag,len);
+ else
+ return -1;
+}
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ CRYPTO_gcm128_finish(ctx, NULL, 0);
+ memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+}
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
+{
+ GCM128_CONTEXT *ret;
+
+ if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+ CRYPTO_gcm128_init(ret,key,block);
+
+ return ret;
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
+{
+ if (ctx) {
+ OPENSSL_cleanse(ctx,sizeof(*ctx));
+ OPENSSL_free(ctx);
+ }
+}
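+
+/*
+ * Minimal usage sketch of the one-shot API above (not part of the
+ * library; key/iv/aad/pt/ct/tag and their lengths are placeholder
+ * variables, and AES from <openssl/aes.h> stands in for any
+ * block128_f-compatible cipher, as in the SELFTEST code below):
+ *
+ *	AES_KEY aes;
+ *	GCM128_CONTEXT ctx;
+ *
+ *	AES_set_encrypt_key(key, 128, &aes);
+ *	CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
+ *	CRYPTO_gcm128_setiv(&ctx, iv, 12);
+ *	CRYPTO_gcm128_aad(&ctx, aad, aad_len);
+ *	CRYPTO_gcm128_encrypt(&ctx, pt, ct, pt_len);
+ *	CRYPTO_gcm128_tag(&ctx, tag, 16);	/* or _finish() to verify */
+ */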
+
+#if defined(SELFTEST)
+#include <stdio.h>
+#include <openssl/aes.h>
+
+/* Test Case 1 */
+static const u8 K1[16],
+ *P1=NULL,
+ *A1=NULL,
+ IV1[12],
+ *C1=NULL,
+ T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+
+/* Test Case 2 */
+#define K2 K1
+#define A2 A1
+#define IV2 IV1
+static const u8 P2[16],
+ C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
+ T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+
+/* Test Case 3 */
+#define A3 A2
+static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+ P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+ 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+ 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+ 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
+ T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+
+/* Test Case 4 */
+#define K4 K3
+#define IV4 IV3
+static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+ 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+ 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+ 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
+ T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+
+/* Test Case 5 */
+#define K5 K4
+#define P5 P4
+#define A5 A4
+static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
+ 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
+ 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
+ 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
+ T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+
+/* Test Case 6 */
+#define K6 K5
+#define P6 P5
+#define A6 A5
+static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
+ 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
+ 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
+ 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
+ T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+
+/* Test Case 7 */
+static const u8 K7[24],
+ *P7=NULL,
+ *A7=NULL,
+ IV7[12],
+ *C7=NULL,
+ T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+
+/* Test Case 8 */
+#define K8 K7
+#define IV8 IV7
+#define A8 A7
+static const u8 P8[16],
+ C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
+ T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+
+/* Test Case 9 */
+#define A9 A8
+static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+ 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
+ P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+ 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+ 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+ 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
+ T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+
+/* Test Case 10 */
+#define K10 K9
+#define IV10 IV9
+static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+ 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+ 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+ 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
+ T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+
+/* Test Case 11 */
+#define K11 K10
+#define P11 P10
+#define A11 A10
+static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
+ 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
+ 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
+ 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
+ T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+
+/* Test Case 12 */
+#define K12 K11
+#define P12 P11
+#define A12 A11
+static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
+ 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
+ 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
+ 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
+ T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+
+/* Test Case 13 */
+static const u8 K13[32],
+ *P13=NULL,
+ *A13=NULL,
+ IV13[12],
+ *C13=NULL,
+ T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+
+/* Test Case 14 */
+#define K14 K13
+#define A14 A13
+static const u8 P14[16],
+ IV14[12],
+ C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
+ T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+
+/* Test Case 15 */
+#define A15 A14
+static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+ 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+ P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+ T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+
+/* Test Case 16 */
+#define K16 K15
+#define IV16 IV15
+static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
+ T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+
+/* Test Case 17 */
+#define K17 K16
+#define P17 P16
+#define A17 A16
+static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
+ 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
+ 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
+ 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
+ T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+
+/* Test Case 18 */
+#define K18 K17
+#define P18 P17
+#define A18 A17
+static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+ 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+ 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+ 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
+ T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+
+/* Test Case 19 */
+#define K19 K1
+#define P19 P1
+#define IV19 IV1
+#define C19 C1
+static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
+ 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+ T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
+
+/* Test Case 20 */
+#define K20 K1
+#define A20 A1
+static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
+ P20[288],
+ C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
+ 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
+ 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
+ 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
+ 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
+ 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
+ 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
+ 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
+ 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
+ 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
+ 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
+ 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
+ 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
+ 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
+ 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
+ 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
+ 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
+ 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
+ T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
+
+#define TEST_CASE(n) do { \
+ u8 out[sizeof(P##n)]; \
+ AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (C##n && memcmp(out,C##n,sizeof(out)))) \
+ ret++, printf ("encrypt test#%d failed.\n",n); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (P##n && memcmp(out,P##n,sizeof(out)))) \
+ ret++, printf ("decrypt test#%d failed.\n",n); \
+ } while(0)
+
+int main()
+{
+ GCM128_CONTEXT ctx;
+ AES_KEY key;
+ int ret=0;
+
+ TEST_CASE(1);
+ TEST_CASE(2);
+ TEST_CASE(3);
+ TEST_CASE(4);
+ TEST_CASE(5);
+ TEST_CASE(6);
+ TEST_CASE(7);
+ TEST_CASE(8);
+ TEST_CASE(9);
+ TEST_CASE(10);
+ TEST_CASE(11);
+ TEST_CASE(12);
+ TEST_CASE(13);
+ TEST_CASE(14);
+ TEST_CASE(15);
+ TEST_CASE(16);
+ TEST_CASE(17);
+ TEST_CASE(18);
+ TEST_CASE(19);
+ TEST_CASE(20);
+
+#ifdef OPENSSL_CPUID_OBJ
+ {
+ size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
+ union { u64 u; u8 c[1024]; } buf;
+ int i;
+
+ AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
+ CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
+
+ CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+ start = OPENSSL_rdtsc();
+ CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+
+ CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+ &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+ (block128_f)AES_encrypt);
+ start = OPENSSL_rdtsc();
+ CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+ &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+ (block128_f)AES_encrypt);
+ ctr_t = OPENSSL_rdtsc() - start;
+
+ printf("%.2f-%.2f=%.2f\n",
+ gcm_t/(double)sizeof(buf),
+ ctr_t/(double)sizeof(buf),
+ (gcm_t-ctr_t)/(double)sizeof(buf));
+#ifdef GHASH
+ {
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx.ghash;
+
+ GHASH((&ctx),buf.c,sizeof(buf));
+ start = OPENSSL_rdtsc();
+ for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+ printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
+ }
+#endif
+ }
+#endif
+
+ return ret;
+}
+#endif
diff --git a/main/openssl/crypto/modes/modes_lcl.h b/main/openssl/crypto/modes/modes_lcl.h
new file mode 100644
index 00000000..9d83e128
--- /dev/null
+++ b/main/openssl/crypto/modes/modes_lcl.h
@@ -0,0 +1,134 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use is governed by OpenSSL license.
+ * ====================================================================
+ */
+
+#include <openssl/modes.h>
+
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef __int64 i64;
+typedef unsigned __int64 u64;
+#define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef long i64;
+typedef unsigned long u64;
+#define U64(C) C##UL
+#else
+typedef long long i64;
+typedef unsigned long long u64;
+#define U64(C) C##ULL
+#endif
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define STRICT_ALIGNMENT 1
+#if defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__s390__) || defined(__s390x__)
+# undef STRICT_ALIGNMENT
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+# define BSWAP8(x) ({ u64 ret=(x); \
+ asm ("bswapq %0" \
+ : "+r"(ret)); ret; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("bswapl %0; bswapl %1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("rev %0,%0; rev %1,%1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret; \
+ asm ("rev %0,%1" \
+ : "=r"(ret) : "r"((u32)(x))); \
+ ret; })
+# endif
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+# define BSWAP8(x) _byteswap_uint64((u64)(x))
+# define BSWAP4(x) _byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+ __inline u32 _bswap4(u32 val) {
+ _asm mov eax,val
+ _asm bswap eax
+ }
+# define BSWAP4(x) _bswap4(x)
+# endif
+#endif
+#endif
+
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+#define GETU32(p) BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
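+/*
+ * GETU32/PUTU32 load/store a 32-bit value in big-endian order; when
+ * unaligned word access is known to be safe, a single byte-swapped
+ * word access replaces the four byte operations.
+ */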
+
+/* GCM definitions */
+
+typedef struct { u64 hi,lo; } u128;
+
+#ifdef TABLE_BITS
+#undef TABLE_BITS
+#endif
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8 [or 1]. For further information see gcm128.c.
+ */
+#define TABLE_BITS 4
+
+struct gcm128_context {
+	/* The following 6 names follow the names in the GCM specification */
+ union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
+ Yi,EKi,EK0,len,Xi,H;
+	/* The relative positions of Xi, H and the pre-computed Htable are
+	 * relied upon by some assembler modules, i.e. don't change the order! */
+#if TABLE_BITS==8
+ u128 Htable[256];
+#else
+ u128 Htable[16];
+ void (*gmult)(u64 Xi[2],const u128 Htable[16]);
+ void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
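+	/* ares/mres: bytes of a partial AAD/message block processed so far */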
+ unsigned int mres, ares;
+ block128_f block;
+ void *key;
+};
+
+struct xts128_context {
+ void *key1, *key2;
+ block128_f block1,block2;
+};
+
+struct ccm128_context {
+ union { u64 u[2]; u8 c[16]; } nonce, cmac;
+ u64 blocks;
+ block128_f block;
+ void *key;
+};
+
diff --git a/main/openssl/crypto/modes/ofb128.c b/main/openssl/crypto/modes/ofb128.c
index c732e2ec..01c01702 100644
--- a/main/openssl/crypto/modes/ofb128.c
+++ b/main/openssl/crypto/modes/ofb128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
/* The input and output encrypted as though 128bit ofb mode is being
* used. The extra state information to record how much of the
* 128bit block we have used is contained in *num;
diff --git a/main/openssl/crypto/modes/xts128.c b/main/openssl/crypto/modes/xts128.c
new file mode 100644
index 00000000..9cf27a25
--- /dev/null
+++ b/main/openssl/crypto/modes/xts128.c
@@ -0,0 +1,191 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+ const unsigned char *inp, unsigned char *out,
+ size_t len, int enc)
+{
+ const union { long one; char little; } is_endian = {1};
+ union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
+ unsigned int i;
+
+ if (len<16) return -1;
+
+ memcpy(tweak.c, iv, 16);
+
+ (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
+
+ if (!enc && (len%16)) len-=16;
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c,inp,16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+#else
+ scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
+ scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+#endif
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out,scratch.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
+ ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+
+ if (len==0) return 0;
+
+ if (is_endian.little) {
+ unsigned int carry,res;
+
+ res = 0x87&(((int)tweak.d[3])>>31);
+ carry = (unsigned int)(tweak.u[0]>>63);
+ tweak.u[0] = (tweak.u[0]<<1)^res;
+ tweak.u[1] = (tweak.u[1]<<1)|carry;
+ }
+ else {
+ size_t c;
+
+ for (c=0,i=0;i<16;++i) {
+			/* '+' substitutes for '|', because c is 1 bit */
+ c += ((size_t)tweak.c[i])<<1;
+ tweak.c[i] = (u8)c;
+ c = c>>8;
+ }
+ tweak.c[0] ^= (u8)(0x87&(0-c));
+ }
+ }
+ if (enc) {
+ for (i=0;i<len;++i) {
+ u8 c = inp[i];
+ out[i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out-16,scratch.c,16);
+ }
+ else {
+ union { u64 u[2]; u8 c[16]; } tweak1;
+
+ if (is_endian.little) {
+ unsigned int carry,res;
+
+ res = 0x87&(((int)tweak.d[3])>>31);
+ carry = (unsigned int)(tweak.u[0]>>63);
+ tweak1.u[0] = (tweak.u[0]<<1)^res;
+ tweak1.u[1] = (tweak.u[1]<<1)|carry;
+ }
+ else {
+ size_t c;
+
+ for (c=0,i=0;i<16;++i) {
+			/* '+' substitutes for '|', because c is 1 bit */
+ c += ((size_t)tweak.c[i])<<1;
+ tweak1.c[i] = (u8)c;
+ c = c>>8;
+ }
+ tweak1.c[0] ^= (u8)(0x87&(0-c));
+ }
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c,inp,16);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+#else
+ scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
+ scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+#endif
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+
+ for (i=0;i<len;++i) {
+ u8 c = inp[16+i];
+ out[16+i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy (out,scratch.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
+ ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+#endif
+ }
+
+ return 0;
+}
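The two tweak-update loops above multiply the tweak by x in GF(2^128): the left shift doubles it, and the 0x87 constant folds the carry back in via the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. As a minimal, hypothetical sketch (not part of this patch), the new routine can be driven with plain AES block functions; the XTS128_CONTEXT layout (key1/key2 plus two block128_f callbacks) comes from modes.h, and the function name below is illustrative only:

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: AES-128-XTS encryption of one data unit. key is 32 bytes,
 * split into the data key (key1) and the tweak key (key2). */
static int xts_aes128_encrypt_sketch(const unsigned char key[32],
                                     const unsigned char iv[16],
                                     const unsigned char *in,
                                     unsigned char *out, size_t len)
{
	AES_KEY k1, k2;
	XTS128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &k1) != 0 ||
	    AES_set_encrypt_key(key + 16, 128, &k2) != 0)
		return -1;
	ctx.key1   = &k1;
	ctx.key2   = &k2;
	ctx.block1 = (block128_f)AES_encrypt;	/* encrypts data blocks */
	ctx.block2 = (block128_f)AES_encrypt;	/* encrypts the tweak */
	return CRYPTO_xts128_encrypt(&ctx, iv, in, out, len, 1);
}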
diff --git a/main/openssl/crypto/o_init.c b/main/openssl/crypto/o_init.c
new file mode 100644
index 00000000..db4cdc44
--- /dev/null
+++ b/main/openssl/crypto/o_init.c
@@ -0,0 +1,82 @@
+/* o_init.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <e_os.h>
+#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/rand.h>
+#endif
+
+/* Perform any essential OpenSSL initialization operations.
+ * Currently this only sets the FIPS callbacks.
+ */
+
+void OPENSSL_init(void)
+ {
+ static int done = 0;
+ if (done)
+ return;
+ done = 1;
+#ifdef OPENSSL_FIPS
+ FIPS_set_locking_callbacks(CRYPTO_lock, CRYPTO_add_lock);
+ FIPS_set_error_callbacks(ERR_put_error, ERR_add_error_vdata);
+ FIPS_set_malloc_callbacks(CRYPTO_malloc, CRYPTO_free);
+ RAND_init_fips();
+#endif
+#if 0
+ fprintf(stderr, "Called OPENSSL_init\n");
+#endif
+ }
+
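Since the function guards itself with the static done flag, a hypothetical caller sketch is simply:

#include <openssl/crypto.h>

void app_startup(void)
{
	OPENSSL_init();	/* performs the FIPS callback setup once */
	OPENSSL_init();	/* subsequent calls return immediately */
}

Note the flag is not protected by a lock, so as written the first call should happen before any threads are spawned.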
diff --git a/main/openssl/crypto/objects/o_names.c b/main/openssl/crypto/objects/o_names.c
index 84380a96..4a548c2e 100644
--- a/main/openssl/crypto/objects/o_names.c
+++ b/main/openssl/crypto/objects/o_names.c
@@ -73,7 +73,7 @@ int OBJ_NAME_new_index(unsigned long (*hash_func)(const char *),
name_funcs_stack=sk_NAME_FUNCS_new_null();
MemCheck_on();
}
- if ((name_funcs_stack == NULL))
+ if (name_funcs_stack == NULL)
{
/* ERROR */
return(0);
diff --git a/main/openssl/crypto/objects/obj_dat.h b/main/openssl/crypto/objects/obj_dat.h
index 6449be60..d404ad07 100644
--- a/main/openssl/crypto/objects/obj_dat.h
+++ b/main/openssl/crypto/objects/obj_dat.h
@@ -62,12 +62,12 @@
* [including the GNU Public Licence.]
*/
-#define NUM_NID 893
-#define NUM_SN 886
-#define NUM_LN 886
-#define NUM_OBJ 840
+#define NUM_NID 920
+#define NUM_SN 913
+#define NUM_LN 913
+#define NUM_OBJ 857
-static const unsigned char lvalues[5824]={
+static const unsigned char lvalues[5980]={
0x00, /* [ 0] OBJ_undef */
0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 1] OBJ_rsadsi */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 7] OBJ_pkcs */
@@ -908,6 +908,23 @@ static const unsigned char lvalues[5824]={
0x55,0x04,0x34, /* [5814] OBJ_supportedAlgorithms */
0x55,0x04,0x35, /* [5817] OBJ_deltaRevocationList */
0x55,0x04,0x36, /* [5820] OBJ_dmdName */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x09,0x10,0x03,0x09,/* [5823] OBJ_id_alg_PWRI_KEK */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x06,/* [5834] OBJ_aes_128_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x07,/* [5843] OBJ_aes_128_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x08,/* [5852] OBJ_id_aes128_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1A,/* [5861] OBJ_aes_192_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1B,/* [5870] OBJ_aes_192_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1C,/* [5879] OBJ_id_aes192_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2E,/* [5888] OBJ_aes_256_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2F,/* [5897] OBJ_aes_256_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x30,/* [5906] OBJ_id_aes256_wrap_pad */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x02,/* [5915] OBJ_id_camellia128_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x03,/* [5926] OBJ_id_camellia192_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x04,/* [5937] OBJ_id_camellia256_wrap */
+0x55,0x1D,0x25,0x00, /* [5948] OBJ_anyExtendedKeyUsage */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x08,/* [5952] OBJ_mgf1 */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x0A,/* [5961] OBJ_rsassaPss */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x07,/* [5970] OBJ_rsaesOaep */
};
static const ASN1_OBJECT nid_objs[NUM_NID]={
@@ -2351,28 +2368,74 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
{"deltaRevocationList","deltaRevocationList",NID_deltaRevocationList,
3,&(lvalues[5817]),0},
{"dmdName","dmdName",NID_dmdName,3,&(lvalues[5820]),0},
+{"id-alg-PWRI-KEK","id-alg-PWRI-KEK",NID_id_alg_PWRI_KEK,11,
+ &(lvalues[5823]),0},
+{"CMAC","cmac",NID_cmac,0,NULL,0},
+{"id-aes128-GCM","aes-128-gcm",NID_aes_128_gcm,9,&(lvalues[5834]),0},
+{"id-aes128-CCM","aes-128-ccm",NID_aes_128_ccm,9,&(lvalues[5843]),0},
+{"id-aes128-wrap-pad","id-aes128-wrap-pad",NID_id_aes128_wrap_pad,9,
+ &(lvalues[5852]),0},
+{"id-aes192-GCM","aes-192-gcm",NID_aes_192_gcm,9,&(lvalues[5861]),0},
+{"id-aes192-CCM","aes-192-ccm",NID_aes_192_ccm,9,&(lvalues[5870]),0},
+{"id-aes192-wrap-pad","id-aes192-wrap-pad",NID_id_aes192_wrap_pad,9,
+ &(lvalues[5879]),0},
+{"id-aes256-GCM","aes-256-gcm",NID_aes_256_gcm,9,&(lvalues[5888]),0},
+{"id-aes256-CCM","aes-256-ccm",NID_aes_256_ccm,9,&(lvalues[5897]),0},
+{"id-aes256-wrap-pad","id-aes256-wrap-pad",NID_id_aes256_wrap_pad,9,
+ &(lvalues[5906]),0},
+{"AES-128-CTR","aes-128-ctr",NID_aes_128_ctr,0,NULL,0},
+{"AES-192-CTR","aes-192-ctr",NID_aes_192_ctr,0,NULL,0},
+{"AES-256-CTR","aes-256-ctr",NID_aes_256_ctr,0,NULL,0},
+{"id-camellia128-wrap","id-camellia128-wrap",NID_id_camellia128_wrap,
+ 11,&(lvalues[5915]),0},
+{"id-camellia192-wrap","id-camellia192-wrap",NID_id_camellia192_wrap,
+ 11,&(lvalues[5926]),0},
+{"id-camellia256-wrap","id-camellia256-wrap",NID_id_camellia256_wrap,
+ 11,&(lvalues[5937]),0},
+{"anyExtendedKeyUsage","Any Extended Key Usage",
+ NID_anyExtendedKeyUsage,4,&(lvalues[5948]),0},
+{"MGF1","mgf1",NID_mgf1,9,&(lvalues[5952]),0},
+{"RSASSA-PSS","rsassaPss",NID_rsassaPss,9,&(lvalues[5961]),0},
+{"AES-128-XTS","aes-128-xts",NID_aes_128_xts,0,NULL,0},
+{"AES-256-XTS","aes-256-xts",NID_aes_256_xts,0,NULL,0},
+{"RC4-HMAC-MD5","rc4-hmac-md5",NID_rc4_hmac_md5,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA1","aes-128-cbc-hmac-sha1",
+ NID_aes_128_cbc_hmac_sha1,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA1","aes-192-cbc-hmac-sha1",
+ NID_aes_192_cbc_hmac_sha1,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
+ NID_aes_256_cbc_hmac_sha1,0,NULL,0},
+{"RSAES-OAEP","rsaesOaep",NID_rsaesOaep,9,&(lvalues[5970]),0},
};
static const unsigned int sn_objs[NUM_SN]={
364, /* "AD_DVCS" */
419, /* "AES-128-CBC" */
+916, /* "AES-128-CBC-HMAC-SHA1" */
421, /* "AES-128-CFB" */
650, /* "AES-128-CFB1" */
653, /* "AES-128-CFB8" */
+904, /* "AES-128-CTR" */
418, /* "AES-128-ECB" */
420, /* "AES-128-OFB" */
+913, /* "AES-128-XTS" */
423, /* "AES-192-CBC" */
+917, /* "AES-192-CBC-HMAC-SHA1" */
425, /* "AES-192-CFB" */
651, /* "AES-192-CFB1" */
654, /* "AES-192-CFB8" */
+905, /* "AES-192-CTR" */
422, /* "AES-192-ECB" */
424, /* "AES-192-OFB" */
427, /* "AES-256-CBC" */
+918, /* "AES-256-CBC-HMAC-SHA1" */
429, /* "AES-256-CFB" */
652, /* "AES-256-CFB1" */
655, /* "AES-256-CFB8" */
+906, /* "AES-256-CTR" */
426, /* "AES-256-ECB" */
428, /* "AES-256-OFB" */
+914, /* "AES-256-XTS" */
91, /* "BF-CBC" */
93, /* "BF-CFB" */
92, /* "BF-ECB" */
@@ -2400,6 +2463,7 @@ static const unsigned int sn_objs[NUM_SN]={
110, /* "CAST5-CFB" */
109, /* "CAST5-ECB" */
111, /* "CAST5-OFB" */
+894, /* "CMAC" */
13, /* "CN" */
141, /* "CRLReason" */
417, /* "CSPName" */
@@ -2451,6 +2515,7 @@ static const unsigned int sn_objs[NUM_SN]={
4, /* "MD5" */
114, /* "MD5-SHA1" */
95, /* "MDC2" */
+911, /* "MGF1" */
388, /* "Mail" */
393, /* "NULL" */
404, /* "NULL" */
@@ -2487,6 +2552,7 @@ static const unsigned int sn_objs[NUM_SN]={
40, /* "RC2-OFB" */
5, /* "RC4" */
97, /* "RC4-40" */
+915, /* "RC4-HMAC-MD5" */
120, /* "RC5-CBC" */
122, /* "RC5-CFB" */
121, /* "RC5-ECB" */
@@ -2507,6 +2573,8 @@ static const unsigned int sn_objs[NUM_SN]={
668, /* "RSA-SHA256" */
669, /* "RSA-SHA384" */
670, /* "RSA-SHA512" */
+919, /* "RSAES-OAEP" */
+912, /* "RSASSA-PSS" */
777, /* "SEED-CBC" */
779, /* "SEED-CFB" */
776, /* "SEED-ECB" */
@@ -2540,6 +2608,7 @@ static const unsigned int sn_objs[NUM_SN]={
363, /* "ad_timestamping" */
376, /* "algorithm" */
405, /* "ansi-X9-62" */
+910, /* "anyExtendedKeyUsage" */
746, /* "anyPolicy" */
370, /* "archiveCutoff" */
484, /* "associatedDomain" */
@@ -2716,14 +2785,27 @@ static const unsigned int sn_objs[NUM_SN]={
357, /* "id-aca-group" */
358, /* "id-aca-role" */
176, /* "id-ad" */
+896, /* "id-aes128-CCM" */
+895, /* "id-aes128-GCM" */
788, /* "id-aes128-wrap" */
+897, /* "id-aes128-wrap-pad" */
+899, /* "id-aes192-CCM" */
+898, /* "id-aes192-GCM" */
789, /* "id-aes192-wrap" */
+900, /* "id-aes192-wrap-pad" */
+902, /* "id-aes256-CCM" */
+901, /* "id-aes256-GCM" */
790, /* "id-aes256-wrap" */
+903, /* "id-aes256-wrap-pad" */
262, /* "id-alg" */
+893, /* "id-alg-PWRI-KEK" */
323, /* "id-alg-des40" */
326, /* "id-alg-dh-pop" */
325, /* "id-alg-dh-sig-hmac-sha1" */
324, /* "id-alg-noSignature" */
+907, /* "id-camellia128-wrap" */
+908, /* "id-camellia192-wrap" */
+909, /* "id-camellia256-wrap" */
268, /* "id-cct" */
361, /* "id-cct-PKIData" */
362, /* "id-cct-PKIResponse" */
@@ -3246,6 +3328,7 @@ static const unsigned int ln_objs[NUM_LN]={
363, /* "AD Time Stamping" */
405, /* "ANSI X9.62" */
368, /* "Acceptable OCSP Responses" */
+910, /* "Any Extended Key Usage" */
664, /* "Any language" */
177, /* "Authority Information Access" */
365, /* "Basic OCSP Response" */
@@ -3386,23 +3469,37 @@ static const unsigned int ln_objs[NUM_LN]={
364, /* "ad dvcs" */
606, /* "additional verification" */
419, /* "aes-128-cbc" */
+916, /* "aes-128-cbc-hmac-sha1" */
+896, /* "aes-128-ccm" */
421, /* "aes-128-cfb" */
650, /* "aes-128-cfb1" */
653, /* "aes-128-cfb8" */
+904, /* "aes-128-ctr" */
418, /* "aes-128-ecb" */
+895, /* "aes-128-gcm" */
420, /* "aes-128-ofb" */
+913, /* "aes-128-xts" */
423, /* "aes-192-cbc" */
+917, /* "aes-192-cbc-hmac-sha1" */
+899, /* "aes-192-ccm" */
425, /* "aes-192-cfb" */
651, /* "aes-192-cfb1" */
654, /* "aes-192-cfb8" */
+905, /* "aes-192-ctr" */
422, /* "aes-192-ecb" */
+898, /* "aes-192-gcm" */
424, /* "aes-192-ofb" */
427, /* "aes-256-cbc" */
+918, /* "aes-256-cbc-hmac-sha1" */
+902, /* "aes-256-ccm" */
429, /* "aes-256-cfb" */
652, /* "aes-256-cfb1" */
655, /* "aes-256-cfb8" */
+906, /* "aes-256-ctr" */
426, /* "aes-256-ecb" */
+901, /* "aes-256-gcm" */
428, /* "aes-256-ofb" */
+914, /* "aes-256-xts" */
376, /* "algorithm" */
484, /* "associatedDomain" */
485, /* "associatedName" */
@@ -3467,6 +3564,7 @@ static const unsigned int ln_objs[NUM_LN]={
407, /* "characteristic-two-field" */
395, /* "clearance" */
633, /* "cleartext track 2" */
+894, /* "cmac" */
13, /* "commonName" */
513, /* "content types" */
50, /* "contentType" */
@@ -3602,13 +3700,20 @@ static const unsigned int ln_objs[NUM_LN]={
358, /* "id-aca-role" */
176, /* "id-ad" */
788, /* "id-aes128-wrap" */
+897, /* "id-aes128-wrap-pad" */
789, /* "id-aes192-wrap" */
+900, /* "id-aes192-wrap-pad" */
790, /* "id-aes256-wrap" */
+903, /* "id-aes256-wrap-pad" */
262, /* "id-alg" */
+893, /* "id-alg-PWRI-KEK" */
323, /* "id-alg-des40" */
326, /* "id-alg-dh-pop" */
325, /* "id-alg-dh-sig-hmac-sha1" */
324, /* "id-alg-noSignature" */
+907, /* "id-camellia128-wrap" */
+908, /* "id-camellia192-wrap" */
+909, /* "id-camellia256-wrap" */
268, /* "id-cct" */
361, /* "id-cct-PKIData" */
362, /* "id-cct-PKIResponse" */
@@ -3806,6 +3911,7 @@ static const unsigned int ln_objs[NUM_LN]={
602, /* "merchant initiated auth" */
514, /* "message extensions" */
51, /* "messageDigest" */
+911, /* "mgf1" */
506, /* "mime-mhs-bodies" */
505, /* "mime-mhs-headings" */
488, /* "mobileTelephoneNumber" */
@@ -3889,6 +3995,7 @@ static const unsigned int ln_objs[NUM_LN]={
40, /* "rc2-ofb" */
5, /* "rc4" */
97, /* "rc4-40" */
+915, /* "rc4-hmac-md5" */
120, /* "rc5-cbc" */
122, /* "rc5-cfb" */
121, /* "rc5-ecb" */
@@ -3905,6 +4012,8 @@ static const unsigned int ln_objs[NUM_LN]={
6, /* "rsaEncryption" */
644, /* "rsaOAEPEncryptionSET" */
377, /* "rsaSignature" */
+919, /* "rsaesOaep" */
+912, /* "rsassaPss" */
124, /* "run length compression" */
482, /* "sOARecord" */
155, /* "safeContentsBag" */
@@ -4254,6 +4363,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
96, /* OBJ_mdc2WithRSA 2 5 8 3 100 */
95, /* OBJ_mdc2 2 5 8 3 101 */
746, /* OBJ_any_policy 2 5 29 32 0 */
+910, /* OBJ_anyExtendedKeyUsage 2 5 29 37 0 */
519, /* OBJ_setct_PANData 2 23 42 0 0 */
520, /* OBJ_setct_PANToken 2 23 42 0 1 */
521, /* OBJ_setct_PANOnly 2 23 42 0 2 */
@@ -4720,6 +4830,9 @@ static const unsigned int obj_objs[NUM_OBJ]={
8, /* OBJ_md5WithRSAEncryption 1 2 840 113549 1 1 4 */
65, /* OBJ_sha1WithRSAEncryption 1 2 840 113549 1 1 5 */
644, /* OBJ_rsaOAEPEncryptionSET 1 2 840 113549 1 1 6 */
+919, /* OBJ_rsaesOaep 1 2 840 113549 1 1 7 */
+911, /* OBJ_mgf1 1 2 840 113549 1 1 8 */
+912, /* OBJ_rsassaPss 1 2 840 113549 1 1 10 */
668, /* OBJ_sha256WithRSAEncryption 1 2 840 113549 1 1 11 */
669, /* OBJ_sha384WithRSAEncryption 1 2 840 113549 1 1 12 */
670, /* OBJ_sha512WithRSAEncryption 1 2 840 113549 1 1 13 */
@@ -4785,16 +4898,25 @@ static const unsigned int obj_objs[NUM_OBJ]={
420, /* OBJ_aes_128_ofb128 2 16 840 1 101 3 4 1 3 */
421, /* OBJ_aes_128_cfb128 2 16 840 1 101 3 4 1 4 */
788, /* OBJ_id_aes128_wrap 2 16 840 1 101 3 4 1 5 */
+895, /* OBJ_aes_128_gcm 2 16 840 1 101 3 4 1 6 */
+896, /* OBJ_aes_128_ccm 2 16 840 1 101 3 4 1 7 */
+897, /* OBJ_id_aes128_wrap_pad 2 16 840 1 101 3 4 1 8 */
422, /* OBJ_aes_192_ecb 2 16 840 1 101 3 4 1 21 */
423, /* OBJ_aes_192_cbc 2 16 840 1 101 3 4 1 22 */
424, /* OBJ_aes_192_ofb128 2 16 840 1 101 3 4 1 23 */
425, /* OBJ_aes_192_cfb128 2 16 840 1 101 3 4 1 24 */
789, /* OBJ_id_aes192_wrap 2 16 840 1 101 3 4 1 25 */
+898, /* OBJ_aes_192_gcm 2 16 840 1 101 3 4 1 26 */
+899, /* OBJ_aes_192_ccm 2 16 840 1 101 3 4 1 27 */
+900, /* OBJ_id_aes192_wrap_pad 2 16 840 1 101 3 4 1 28 */
426, /* OBJ_aes_256_ecb 2 16 840 1 101 3 4 1 41 */
427, /* OBJ_aes_256_cbc 2 16 840 1 101 3 4 1 42 */
428, /* OBJ_aes_256_ofb128 2 16 840 1 101 3 4 1 43 */
429, /* OBJ_aes_256_cfb128 2 16 840 1 101 3 4 1 44 */
790, /* OBJ_id_aes256_wrap 2 16 840 1 101 3 4 1 45 */
+901, /* OBJ_aes_256_gcm 2 16 840 1 101 3 4 1 46 */
+902, /* OBJ_aes_256_ccm 2 16 840 1 101 3 4 1 47 */
+903, /* OBJ_id_aes256_wrap_pad 2 16 840 1 101 3 4 1 48 */
672, /* OBJ_sha256 2 16 840 1 101 3 4 2 1 */
673, /* OBJ_sha384 2 16 840 1 101 3 4 2 2 */
674, /* OBJ_sha512 2 16 840 1 101 3 4 2 3 */
@@ -4901,6 +5023,9 @@ static const unsigned int obj_objs[NUM_OBJ]={
751, /* OBJ_camellia_128_cbc 1 2 392 200011 61 1 1 1 2 */
752, /* OBJ_camellia_192_cbc 1 2 392 200011 61 1 1 1 3 */
753, /* OBJ_camellia_256_cbc 1 2 392 200011 61 1 1 1 4 */
+907, /* OBJ_id_camellia128_wrap 1 2 392 200011 61 1 1 3 2 */
+908, /* OBJ_id_camellia192_wrap 1 2 392 200011 61 1 1 3 3 */
+909, /* OBJ_id_camellia256_wrap 1 2 392 200011 61 1 1 3 4 */
196, /* OBJ_id_smime_mod_cms 1 2 840 113549 1 9 16 0 1 */
197, /* OBJ_id_smime_mod_ess 1 2 840 113549 1 9 16 0 2 */
198, /* OBJ_id_smime_mod_oid 1 2 840 113549 1 9 16 0 3 */
@@ -4956,6 +5081,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
246, /* OBJ_id_smime_alg_CMS3DESwrap 1 2 840 113549 1 9 16 3 6 */
247, /* OBJ_id_smime_alg_CMSRC2wrap 1 2 840 113549 1 9 16 3 7 */
125, /* OBJ_zlib_compression 1 2 840 113549 1 9 16 3 8 */
+893, /* OBJ_id_alg_PWRI_KEK 1 2 840 113549 1 9 16 3 9 */
248, /* OBJ_id_smime_cd_ldap 1 2 840 113549 1 9 16 4 1 */
249, /* OBJ_id_smime_spq_ets_sqt_uri 1 2 840 113549 1 9 16 5 1 */
250, /* OBJ_id_smime_spq_ets_sqt_unotice 1 2 840 113549 1 9 16 5 2 */
diff --git a/main/openssl/crypto/objects/obj_mac.h b/main/openssl/crypto/objects/obj_mac.h
index 282f11a8..b5ea7cda 100644
--- a/main/openssl/crypto/objects/obj_mac.h
+++ b/main/openssl/crypto/objects/obj_mac.h
@@ -580,6 +580,21 @@
#define NID_sha1WithRSAEncryption 65
#define OBJ_sha1WithRSAEncryption OBJ_pkcs1,5L
+#define SN_rsaesOaep "RSAES-OAEP"
+#define LN_rsaesOaep "rsaesOaep"
+#define NID_rsaesOaep 919
+#define OBJ_rsaesOaep OBJ_pkcs1,7L
+
+#define SN_mgf1 "MGF1"
+#define LN_mgf1 "mgf1"
+#define NID_mgf1 911
+#define OBJ_mgf1 OBJ_pkcs1,8L
+
+#define SN_rsassaPss "RSASSA-PSS"
+#define LN_rsassaPss "rsassaPss"
+#define NID_rsassaPss 912
+#define OBJ_rsassaPss OBJ_pkcs1,10L
+
#define SN_sha256WithRSAEncryption "RSA-SHA256"
#define LN_sha256WithRSAEncryption "sha256WithRSAEncryption"
#define NID_sha256WithRSAEncryption 668
@@ -981,6 +996,10 @@
#define NID_id_smime_alg_CMSRC2wrap 247
#define OBJ_id_smime_alg_CMSRC2wrap OBJ_id_smime_alg,7L
+#define SN_id_alg_PWRI_KEK "id-alg-PWRI-KEK"
+#define NID_id_alg_PWRI_KEK 893
+#define OBJ_id_alg_PWRI_KEK OBJ_id_smime_alg,9L
+
#define SN_id_smime_cd_ldap "id-smime-cd-ldap"
#define NID_id_smime_cd_ldap 248
#define OBJ_id_smime_cd_ldap OBJ_id_smime_cd,1L
@@ -2399,6 +2418,11 @@
#define NID_no_rev_avail 403
#define OBJ_no_rev_avail OBJ_id_ce,56L
+#define SN_anyExtendedKeyUsage "anyExtendedKeyUsage"
+#define LN_anyExtendedKeyUsage "Any Extended Key Usage"
+#define NID_anyExtendedKeyUsage 910
+#define OBJ_anyExtendedKeyUsage OBJ_ext_key_usage,0L
+
#define SN_netscape "Netscape"
#define LN_netscape "Netscape Communications Corp."
#define NID_netscape 57
@@ -2586,6 +2610,24 @@
#define NID_aes_128_cfb128 421
#define OBJ_aes_128_cfb128 OBJ_aes,4L
+#define SN_id_aes128_wrap "id-aes128-wrap"
+#define NID_id_aes128_wrap 788
+#define OBJ_id_aes128_wrap OBJ_aes,5L
+
+#define SN_aes_128_gcm "id-aes128-GCM"
+#define LN_aes_128_gcm "aes-128-gcm"
+#define NID_aes_128_gcm 895
+#define OBJ_aes_128_gcm OBJ_aes,6L
+
+#define SN_aes_128_ccm "id-aes128-CCM"
+#define LN_aes_128_ccm "aes-128-ccm"
+#define NID_aes_128_ccm 896
+#define OBJ_aes_128_ccm OBJ_aes,7L
+
+#define SN_id_aes128_wrap_pad "id-aes128-wrap-pad"
+#define NID_id_aes128_wrap_pad 897
+#define OBJ_id_aes128_wrap_pad OBJ_aes,8L
+
#define SN_aes_192_ecb "AES-192-ECB"
#define LN_aes_192_ecb "aes-192-ecb"
#define NID_aes_192_ecb 422
@@ -2606,6 +2648,24 @@
#define NID_aes_192_cfb128 425
#define OBJ_aes_192_cfb128 OBJ_aes,24L
+#define SN_id_aes192_wrap "id-aes192-wrap"
+#define NID_id_aes192_wrap 789
+#define OBJ_id_aes192_wrap OBJ_aes,25L
+
+#define SN_aes_192_gcm "id-aes192-GCM"
+#define LN_aes_192_gcm "aes-192-gcm"
+#define NID_aes_192_gcm 898
+#define OBJ_aes_192_gcm OBJ_aes,26L
+
+#define SN_aes_192_ccm "id-aes192-CCM"
+#define LN_aes_192_ccm "aes-192-ccm"
+#define NID_aes_192_ccm 899
+#define OBJ_aes_192_ccm OBJ_aes,27L
+
+#define SN_id_aes192_wrap_pad "id-aes192-wrap-pad"
+#define NID_id_aes192_wrap_pad 900
+#define OBJ_id_aes192_wrap_pad OBJ_aes,28L
+
#define SN_aes_256_ecb "AES-256-ECB"
#define LN_aes_256_ecb "aes-256-ecb"
#define NID_aes_256_ecb 426
@@ -2626,6 +2686,24 @@
#define NID_aes_256_cfb128 429
#define OBJ_aes_256_cfb128 OBJ_aes,44L
+#define SN_id_aes256_wrap "id-aes256-wrap"
+#define NID_id_aes256_wrap 790
+#define OBJ_id_aes256_wrap OBJ_aes,45L
+
+#define SN_aes_256_gcm "id-aes256-GCM"
+#define LN_aes_256_gcm "aes-256-gcm"
+#define NID_aes_256_gcm 901
+#define OBJ_aes_256_gcm OBJ_aes,46L
+
+#define SN_aes_256_ccm "id-aes256-CCM"
+#define LN_aes_256_ccm "aes-256-ccm"
+#define NID_aes_256_ccm 902
+#define OBJ_aes_256_ccm OBJ_aes,47L
+
+#define SN_id_aes256_wrap_pad "id-aes256-wrap-pad"
+#define NID_id_aes256_wrap_pad 903
+#define OBJ_id_aes256_wrap_pad OBJ_aes,48L
+
#define SN_aes_128_cfb1 "AES-128-CFB1"
#define LN_aes_128_cfb1 "aes-128-cfb1"
#define NID_aes_128_cfb1 650
@@ -2650,6 +2728,26 @@
#define LN_aes_256_cfb8 "aes-256-cfb8"
#define NID_aes_256_cfb8 655
+#define SN_aes_128_ctr "AES-128-CTR"
+#define LN_aes_128_ctr "aes-128-ctr"
+#define NID_aes_128_ctr 904
+
+#define SN_aes_192_ctr "AES-192-CTR"
+#define LN_aes_192_ctr "aes-192-ctr"
+#define NID_aes_192_ctr 905
+
+#define SN_aes_256_ctr "AES-256-CTR"
+#define LN_aes_256_ctr "aes-256-ctr"
+#define NID_aes_256_ctr 906
+
+#define SN_aes_128_xts "AES-128-XTS"
+#define LN_aes_128_xts "aes-128-xts"
+#define NID_aes_128_xts 913
+
+#define SN_aes_256_xts "AES-256-XTS"
+#define LN_aes_256_xts "aes-256-xts"
+#define NID_aes_256_xts 914
+
#define SN_des_cfb1 "DES-CFB1"
#define LN_des_cfb1 "des-cfb1"
#define NID_des_cfb1 656
@@ -2666,18 +2764,6 @@
#define LN_des_ede3_cfb8 "des-ede3-cfb8"
#define NID_des_ede3_cfb8 659
-#define SN_id_aes128_wrap "id-aes128-wrap"
-#define NID_id_aes128_wrap 788
-#define OBJ_id_aes128_wrap OBJ_aes,5L
-
-#define SN_id_aes192_wrap "id-aes192-wrap"
-#define NID_id_aes192_wrap 789
-#define OBJ_id_aes192_wrap OBJ_aes,25L
-
-#define SN_id_aes256_wrap "id-aes256-wrap"
-#define NID_id_aes256_wrap 790
-#define OBJ_id_aes256_wrap OBJ_aes,45L
-
#define OBJ_nist_hashalgs OBJ_nistAlgorithms,2L
#define SN_sha256 "SHA256"
@@ -3810,6 +3896,18 @@
#define NID_camellia_256_cbc 753
#define OBJ_camellia_256_cbc 1L,2L,392L,200011L,61L,1L,1L,1L,4L
+#define SN_id_camellia128_wrap "id-camellia128-wrap"
+#define NID_id_camellia128_wrap 907
+#define OBJ_id_camellia128_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,2L
+
+#define SN_id_camellia192_wrap "id-camellia192-wrap"
+#define NID_id_camellia192_wrap 908
+#define OBJ_id_camellia192_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,3L
+
+#define SN_id_camellia256_wrap "id-camellia256-wrap"
+#define NID_id_camellia256_wrap 909
+#define OBJ_id_camellia256_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,4L
+
#define OBJ_ntt_ds 0L,3L,4401L,5L
#define OBJ_camellia OBJ_ntt_ds,3L,1L,9L
@@ -3912,3 +4010,23 @@
#define LN_hmac "hmac"
#define NID_hmac 855
+#define SN_cmac "CMAC"
+#define LN_cmac "cmac"
+#define NID_cmac 894
+
+#define SN_rc4_hmac_md5 "RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5 "rc4-hmac-md5"
+#define NID_rc4_hmac_md5 915
+
+#define SN_aes_128_cbc_hmac_sha1 "AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1 "aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1 916
+
+#define SN_aes_192_cbc_hmac_sha1 "AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1 "aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1 917
+
+#define SN_aes_256_cbc_hmac_sha1 "AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1 "aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1 918
+
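obj_mac.h, obj_mac.num and obj_dat.h are generated together from objects.txt, which is why the same new NIDs (893-919) recur across these hunks. A minimal, hypothetical lookup sketch using the standard OBJ accessors:

#include <stdio.h>
#include <openssl/objects.h>

int main(void)
{
	/* short name / long name for a newly added NID */
	printf("%s / %s\n", OBJ_nid2sn(NID_aes_128_gcm),
	       OBJ_nid2ln(NID_aes_128_gcm));	/* id-aes128-GCM / aes-128-gcm */
	/* dotted OID text back to the NID (895) */
	printf("%d\n", OBJ_txt2nid("2.16.840.1.101.3.4.1.6"));
	return 0;
}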
diff --git a/main/openssl/crypto/objects/obj_mac.num b/main/openssl/crypto/objects/obj_mac.num
index 8c50aac2..1d0a7c80 100644
--- a/main/openssl/crypto/objects/obj_mac.num
+++ b/main/openssl/crypto/objects/obj_mac.num
@@ -890,3 +890,30 @@ houseIdentifier 889
supportedAlgorithms 890
deltaRevocationList 891
dmdName 892
+id_alg_PWRI_KEK 893
+cmac 894
+aes_128_gcm 895
+aes_128_ccm 896
+id_aes128_wrap_pad 897
+aes_192_gcm 898
+aes_192_ccm 899
+id_aes192_wrap_pad 900
+aes_256_gcm 901
+aes_256_ccm 902
+id_aes256_wrap_pad 903
+aes_128_ctr 904
+aes_192_ctr 905
+aes_256_ctr 906
+id_camellia128_wrap 907
+id_camellia192_wrap 908
+id_camellia256_wrap 909
+anyExtendedKeyUsage 910
+mgf1 911
+rsassaPss 912
+aes_128_xts 913
+aes_256_xts 914
+rc4_hmac_md5 915
+aes_128_cbc_hmac_sha1 916
+aes_192_cbc_hmac_sha1 917
+aes_256_cbc_hmac_sha1 918
+rsaesOaep 919
diff --git a/main/openssl/crypto/objects/obj_xref.c b/main/openssl/crypto/objects/obj_xref.c
index 152eca5c..9f744bce 100644
--- a/main/openssl/crypto/objects/obj_xref.c
+++ b/main/openssl/crypto/objects/obj_xref.c
@@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid)
#endif
if (rv == NULL)
return 0;
- *pdig_nid = rv->hash_id;
- *ppkey_nid = rv->pkey_id;
+ if (pdig_nid)
+ *pdig_nid = rv->hash_id;
+ if (ppkey_nid)
+ *ppkey_nid = rv->pkey_id;
return 1;
}
@@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid)
#endif
if (rv == NULL)
return 0;
- *psignid = (*rv)->sign_id;
+ if (psignid)
+ *psignid = (*rv)->sign_id;
return 1;
}
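With the NULL checks above, either output pointer may now be omitted; a hypothetical caller that only needs the digest side:

#include <openssl/objects.h>

static int digest_nid_of_sig(int sig_nid)
{
	int dig = NID_undef;

	if (!OBJ_find_sigid_algs(sig_nid, &dig, NULL))	/* pkey NID unwanted */
		return NID_undef;
	return dig;	/* NID_undef for RSASSA-PSS, per the new xref entry */
}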
diff --git a/main/openssl/crypto/objects/obj_xref.h b/main/openssl/crypto/objects/obj_xref.h
index d5b9b8e1..e23938c2 100644
--- a/main/openssl/crypto/objects/obj_xref.h
+++ b/main/openssl/crypto/objects/obj_xref.h
@@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] =
{NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94},
{NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc},
{NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc},
+ {NID_rsassaPss, NID_undef, NID_rsaEncryption},
};
static const nid_triple * const sigoid_srt_xref[] =
{
+ &sigoid_srt[29],
&sigoid_srt[17],
&sigoid_srt[18],
&sigoid_srt[0],
diff --git a/main/openssl/crypto/objects/obj_xref.txt b/main/openssl/crypto/objects/obj_xref.txt
index e45b3d34..cb917182 100644
--- a/main/openssl/crypto/objects/obj_xref.txt
+++ b/main/openssl/crypto/objects/obj_xref.txt
@@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption
sha224WithRSAEncryption sha224 rsaEncryption
mdc2WithRSA mdc2 rsaEncryption
ripemd160WithRSA ripemd160 rsaEncryption
+# For PSS the digest algorithm can vary and depends on the included
+# AlgorithmIdentifier. The digest "undef" indicates the public key
+# method should handle this explicitly.
+rsassaPss undef rsaEncryption
# Alternative deprecated OIDs. By using the older "rsa" OID this
# type will be recognized but not normally used.
diff --git a/main/openssl/crypto/objects/objects.txt b/main/openssl/crypto/objects/objects.txt
index e61fe60c..d3bfad72 100644
--- a/main/openssl/crypto/objects/objects.txt
+++ b/main/openssl/crypto/objects/objects.txt
@@ -166,6 +166,10 @@ pkcs1 3 : RSA-MD4 : md4WithRSAEncryption
pkcs1 4 : RSA-MD5 : md5WithRSAEncryption
pkcs1 5 : RSA-SHA1 : sha1WithRSAEncryption
# According to PKCS #1 version 2.1
+pkcs1 7 : RSAES-OAEP : rsaesOaep
+pkcs1 8 : MGF1 : mgf1
+pkcs1 10 : RSASSA-PSS : rsassaPss
+
pkcs1 11 : RSA-SHA256 : sha256WithRSAEncryption
pkcs1 12 : RSA-SHA384 : sha384WithRSAEncryption
pkcs1 13 : RSA-SHA512 : sha512WithRSAEncryption
@@ -299,6 +303,7 @@ id-smime-alg 4 : id-smime-alg-RC2wrap
id-smime-alg 5 : id-smime-alg-ESDH
id-smime-alg 6 : id-smime-alg-CMS3DESwrap
id-smime-alg 7 : id-smime-alg-CMSRC2wrap
+id-smime-alg 9 : id-alg-PWRI-KEK
# S/MIME Certificate Distribution
id-smime-cd 1 : id-smime-cd-ldap
@@ -770,6 +775,10 @@ id-ce 55 : targetInformation : X509v3 AC Targeting
!Cname no-rev-avail
id-ce 56 : noRevAvail : X509v3 No Revocation Available
+# From RFC5280
+ext-key-usage 0 : anyExtendedKeyUsage : Any Extended Key Usage
+
+
!Cname netscape
2 16 840 1 113730 : Netscape : Netscape Communications Corp.
!Cname netscape-cert-extension
@@ -846,6 +855,10 @@ aes 2 : AES-128-CBC : aes-128-cbc
aes 3 : AES-128-OFB : aes-128-ofb
!Cname aes-128-cfb128
aes 4 : AES-128-CFB : aes-128-cfb
+aes 5 : id-aes128-wrap
+aes 6 : id-aes128-GCM : aes-128-gcm
+aes 7 : id-aes128-CCM : aes-128-ccm
+aes 8 : id-aes128-wrap-pad
aes 21 : AES-192-ECB : aes-192-ecb
aes 22 : AES-192-CBC : aes-192-cbc
@@ -853,6 +866,10 @@ aes 22 : AES-192-CBC : aes-192-cbc
aes 23 : AES-192-OFB : aes-192-ofb
!Cname aes-192-cfb128
aes 24 : AES-192-CFB : aes-192-cfb
+aes 25 : id-aes192-wrap
+aes 26 : id-aes192-GCM : aes-192-gcm
+aes 27 : id-aes192-CCM : aes-192-ccm
+aes 28 : id-aes192-wrap-pad
aes 41 : AES-256-ECB : aes-256-ecb
aes 42 : AES-256-CBC : aes-256-cbc
@@ -860,6 +877,10 @@ aes 42 : AES-256-CBC : aes-256-cbc
aes 43 : AES-256-OFB : aes-256-ofb
!Cname aes-256-cfb128
aes 44 : AES-256-CFB : aes-256-cfb
+aes 45 : id-aes256-wrap
+aes 46 : id-aes256-GCM : aes-256-gcm
+aes 47 : id-aes256-CCM : aes-256-ccm
+aes 48 : id-aes256-wrap-pad
# There are no OIDs for these modes...
@@ -869,15 +890,16 @@ aes 44 : AES-256-CFB : aes-256-cfb
: AES-128-CFB8 : aes-128-cfb8
: AES-192-CFB8 : aes-192-cfb8
: AES-256-CFB8 : aes-256-cfb8
+ : AES-128-CTR : aes-128-ctr
+ : AES-192-CTR : aes-192-ctr
+ : AES-256-CTR : aes-256-ctr
+ : AES-128-XTS : aes-128-xts
+ : AES-256-XTS : aes-256-xts
: DES-CFB1 : des-cfb1
: DES-CFB8 : des-cfb8
: DES-EDE3-CFB1 : des-ede3-cfb1
: DES-EDE3-CFB8 : des-ede3-cfb8
-aes 5 : id-aes128-wrap
-aes 25 : id-aes192-wrap
-aes 45 : id-aes256-wrap
-
# OIDs for SHA224, SHA256, SHA385 and SHA512, according to x9.84.
!Alias nist_hashalgs nistAlgorithms 2
nist_hashalgs 1 : SHA256 : sha256
@@ -1211,6 +1233,9 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se
1 2 392 200011 61 1 1 1 2 : CAMELLIA-128-CBC : camellia-128-cbc
1 2 392 200011 61 1 1 1 3 : CAMELLIA-192-CBC : camellia-192-cbc
1 2 392 200011 61 1 1 1 4 : CAMELLIA-256-CBC : camellia-256-cbc
+1 2 392 200011 61 1 1 3 2 : id-camellia128-wrap
+1 2 392 200011 61 1 1 3 3 : id-camellia192-wrap
+1 2 392 200011 61 1 1 3 4 : id-camellia256-wrap
# Definitions for Camellia cipher - ECB, CFB, OFB MODE
@@ -1257,3 +1282,11 @@ kisa 1 6 : SEED-OFB : seed-ofb
# There is no OID that just denotes "HMAC" oddly enough...
: HMAC : hmac
+# Nor CMAC either
+ : CMAC : cmac
+
+# Synthetic composite ciphersuites
+ : RC4-HMAC-MD5 : rc4-hmac-md5
+ : AES-128-CBC-HMAC-SHA1 : aes-128-cbc-hmac-sha1
+ : AES-192-CBC-HMAC-SHA1 : aes-192-cbc-hmac-sha1
+ : AES-256-CBC-HMAC-SHA1 : aes-256-cbc-hmac-sha1
diff --git a/main/openssl/crypto/ocsp/ocsp_lib.c b/main/openssl/crypto/ocsp/ocsp_lib.c
index e92b86c0..a94dc838 100644
--- a/main/openssl/crypto/ocsp/ocsp_lib.c
+++ b/main/openssl/crypto/ocsp/ocsp_lib.c
@@ -124,7 +124,8 @@ OCSP_CERTID *OCSP_cert_id_new(const EVP_MD *dgst,
if (!(ASN1_OCTET_STRING_set(cid->issuerNameHash, md, i))) goto err;
/* Calculate the issuerKey hash, excluding tag and length */
- EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL);
+ if (!EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL))
+ goto err;
if (!(ASN1_OCTET_STRING_set(cid->issuerKeyHash, md, i))) goto err;
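The pattern being adopted above — testing EVP_Digest()'s return value instead of ignoring it — in a hypothetical standalone form:

#include <openssl/evp.h>

static int hash_buf(const unsigned char *buf, size_t len,
                    unsigned char md[EVP_MAX_MD_SIZE], unsigned int *mdlen)
{
	/* EVP_Digest() can fail (e.g. engine or digest errors), so the
	 * result must not be used unless it returns 1 */
	return EVP_Digest(buf, len, md, mdlen, EVP_sha1(), NULL);
}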
diff --git a/main/openssl/crypto/ocsp/ocsp_vfy.c b/main/openssl/crypto/ocsp/ocsp_vfy.c
index 415d67e6..27671830 100644
--- a/main/openssl/crypto/ocsp/ocsp_vfy.c
+++ b/main/openssl/crypto/ocsp/ocsp_vfy.c
@@ -91,9 +91,12 @@ int OCSP_basic_verify(OCSP_BASICRESP *bs, STACK_OF(X509) *certs,
{
EVP_PKEY *skey;
skey = X509_get_pubkey(signer);
- ret = OCSP_BASICRESP_verify(bs, skey, 0);
- EVP_PKEY_free(skey);
- if(ret <= 0)
+ if (skey)
+ {
+ ret = OCSP_BASICRESP_verify(bs, skey, 0);
+ EVP_PKEY_free(skey);
+ }
+ if(!skey || ret <= 0)
{
OCSPerr(OCSP_F_OCSP_BASIC_VERIFY, OCSP_R_SIGNATURE_FAILURE);
goto end;
@@ -108,6 +111,7 @@ int OCSP_basic_verify(OCSP_BASICRESP *bs, STACK_OF(X509) *certs,
init_res = X509_STORE_CTX_init(&ctx, st, signer, bs->certs);
if(!init_res)
{
+ ret = -1;
OCSPerr(OCSP_F_OCSP_BASIC_VERIFY,ERR_R_X509_LIB);
goto end;
}
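Because the init-failure path now sets ret = -1 (and a NULL public key is caught before use), callers should continue to treat anything <= 0 as failure; a hypothetical call site, with bs, certs and store set up elsewhere:

	if (OCSP_basic_verify(bs, certs, store, 0) <= 0) {
		/* signature invalid, or verification could not be performed */
	}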
diff --git a/main/openssl/crypto/opensslconf-32.h b/main/openssl/crypto/opensslconf-32.h
new file mode 100644
index 00000000..d6625489
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-32.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alphas, because they lack byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#define BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a
+ * 20% speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script, that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense
+ * of 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutmann <pgut001@cs.auckland.ac.nz>.
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+ CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
+ even newer MIPS CPU's, but at the moment one size fits all for
+ optimization options. Older Sparc's work better with only UNROLL, but
+ there's no way to tell at compile time what it is you're running on */
+
+#if defined( sun ) /* Newer Sparc's */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* Systems-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-64.h b/main/openssl/crypto/opensslconf-64.h
new file mode 100644
index 00000000..70c5a2cb
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-64.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alphas, because they lack byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#define SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#undef THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a
+ * 20% speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script, that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense
+ * of 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutmann <pgut001@cs.auckland.ac.nz>.
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+ CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
+ even newer MIPS CPU's, but at the moment one size fits all for
+ optimization options. Older Sparc's work better with only UNROLL, but
+ there's no way to tell at compile time what it is you're running on */
+
+#if defined( sun ) /* Newer Sparc's */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* Systems-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
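The only substantive difference between the 32- and 64-bit variants of this generated header is the BN word-size block (BN_LLONG plus THIRTY_TWO_BIT versus SIXTY_FOUR_BIT_LONG). A simplified, hypothetical sketch of what bn.h derives from those macros (the real logic in bn.h has more cases and uses #define rather than typedef):

/* sketch only */
#if defined(SIXTY_FOUR_BIT_LONG)
typedef unsigned long BN_ULONG;	/* 64-bit limbs on LP64 targets */
#elif defined(THIRTY_TWO_BIT)
typedef unsigned int BN_ULONG;	/* 32-bit limbs; BN_LLONG enables a
				 * 64-bit intermediate product */
#endif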
diff --git a/main/openssl/crypto/opensslconf-static-32.h b/main/openssl/crypto/opensslconf-static-32.h
new file mode 100644
index 00000000..d6625489
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-32.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#define BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-static-64.h b/main/openssl/crypto/opensslconf-static-64.h
new file mode 100644
index 00000000..70c5a2cb
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-64.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#define SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#undef THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
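Annotation: these generated headers are consumed through the OPENSSL_NO_<ALG> feature-test macros; each one compiles an algorithm out, and portable callers are expected to guard optional features on them. A minimal sketch of the caller side (hypothetical application code, not part of this tree):

#include <openssl/opensslconf.h>
#include <openssl/evp.h>

/* Pick a digest that survives this build configuration: MD2 is
 * compiled out above, so fall back to SHA-1 when it is absent. */
static const EVP_MD *pick_digest(void)
{
#ifndef OPENSSL_NO_MD2
    return EVP_md2();
#else
    return EVP_sha1();        /* always present in this configuration */
#endif
}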
diff --git a/main/openssl/crypto/opensslconf-static-trusty.h b/main/openssl/crypto/opensslconf-static-trusty.h
new file mode 100644
index 00000000..06f9f982
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-trusty.h
@@ -0,0 +1,448 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_CMS
+# define OPENSSL_NO_CMS
+#endif
+#ifndef OPENSSL_NO_COMP
+# define OPENSSL_NO_COMP
+#endif
+#ifndef OPENSSL_NO_CONF
+# define OPENSSL_NO_CONF
+#endif
+#ifndef OPENSSL_NO_DES
+# define OPENSSL_NO_DES
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_LOCKING
+# define OPENSSL_NO_LOCKING
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MD4
+# define OPENSSL_NO_MD4
+#endif
+#ifndef OPENSSL_NO_MD5
+# define OPENSSL_NO_MD5
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_OCSP
+# define OPENSSL_NO_OCSP
+#endif
+#ifndef OPENSSL_NO_PEM
+# define OPENSSL_NO_PEM
+#endif
+#ifndef OPENSSL_NO_PKCS12
+# define OPENSSL_NO_PKCS12
+#endif
+#ifndef OPENSSL_NO_PQUEUE
+# define OPENSSL_NO_PQUEUE
+#endif
+#ifndef OPENSSL_NO_RC2
+# define OPENSSL_NO_RC2
+#endif
+#ifndef OPENSSL_NO_RC4
+# define OPENSSL_NO_RC4
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RIPEMD
+# define OPENSSL_NO_RIPEMD
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_SRP
+# define OPENSSL_NO_SRP
+#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
+#ifndef OPENSSL_NO_SSL3
+# define OPENSSL_NO_SSL3
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_TLS1
+# define OPENSSL_NO_TLS1
+#endif
+#ifndef OPENSSL_NO_TLSEXT
+# define OPENSSL_NO_TLSEXT
+#endif
+#ifndef OPENSSL_NO_TS
+# define OPENSSL_NO_TS
+#endif
+#ifndef OPENSSL_NO_TXT_DB
+# define OPENSSL_NO_TXT_DB
+#endif
+#ifndef OPENSSL_NO_UI
+# define OPENSSL_NO_UI
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_NO_ERR
+# define OPENSSL_NO_ERR
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS)
+# define NO_CMS
+# endif
+# if defined(OPENSSL_NO_COMP) && !defined(NO_COMP)
+# define NO_COMP
+# endif
+# if defined(OPENSSL_NO_CONF) && !defined(NO_CONF)
+# define NO_CONF
+# endif
+# if defined(OPENSSL_NO_DES) && !defined(NO_DES)
+# define NO_DES
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_LOCKING) && !defined(NO_LOCKING)
+# define NO_LOCKING
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MD4) && !defined(NO_MD4)
+# define NO_MD4
+# endif
+# if defined(OPENSSL_NO_MD5) && !defined(NO_MD5)
+# define NO_MD5
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_OCSP) && !defined(NO_OCSP)
+# define NO_OCSP
+# endif
+# if defined(OPENSSL_NO_PEM) && !defined(NO_PEM)
+# define NO_PEM
+# endif
+# if defined(OPENSSL_NO_PKCS12) && !defined(NO_PKCS12)
+# define NO_PKCS12
+# endif
+# if defined(OPENSSL_NO_PQUEUE) && !defined(NO_PQUEUE)
+# define NO_PQUEUE
+# endif
+# if defined(OPENSSL_NO_RC2) && !defined(NO_RC2)
+# define NO_RC2
+# endif
+# if defined(OPENSSL_NO_RC4) && !defined(NO_RC4)
+# define NO_RC4
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD)
+# define NO_RIPEMD
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_SRP) && !defined(NO_SRP)
+# define NO_SRP
+# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
+# if defined(OPENSSL_NO_SSL3) && !defined(NO_SSL3)
+# define NO_SSL3
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_TLS1) && !defined(NO_TLS1)
+# define NO_TLS1
+# endif
+# if defined(OPENSSL_NO_TLSEXT) && !defined(NO_TLSEXT)
+# define NO_TLSEXT
+# endif
+# if defined(OPENSSL_NO_TS) && !defined(NO_TS)
+# define NO_TS
+# endif
+# if defined(OPENSSL_NO_TXT_DB) && !defined(NO_TXT_DB)
+# define NO_TXT_DB
+# endif
+# if defined(OPENSSL_NO_UI) && !defined(NO_UI)
+# define NO_UI
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <trusty_std.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned int
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#undef RC4_CHUNK
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned long
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#undef BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#undef DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-static.h b/main/openssl/crypto/opensslconf-static.h
new file mode 100644
index 00000000..f63a6e0a
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static.h
@@ -0,0 +1,6 @@
+// Auto-generated - DO NOT EDIT!
+#if defined(__LP64__)
+#include "opensslconf-static-64.h"
+#else
+#include "opensslconf-static-32.h"
+#endif
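Annotation: this dispatch header is what lets one source tree serve both ABIs; __LP64__ selects the 64-bit configuration (SIXTY_FOUR_BIT_LONG, BN_LLONG undefined) and everything else gets the 32-bit one (THIRTY_TWO_BIT, BN_LLONG defined). A toy translation-unit check, assuming the header is on the include path:

#include <stdio.h>
#include "opensslconf-static.h"

int main(void)
{
#if defined(__LP64__)
    puts("64-bit config: BN limbs are 64-bit longs (SIXTY_FOUR_BIT_LONG)");
#else
    puts("32-bit config: BN limbs are 32-bit (THIRTY_TWO_BIT, BN_LLONG)");
#endif
    return 0;
}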
diff --git a/main/openssl/crypto/opensslconf-trusty.h b/main/openssl/crypto/opensslconf-trusty.h
new file mode 100644
index 00000000..06f9f982
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-trusty.h
@@ -0,0 +1,448 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_CMS
+# define OPENSSL_NO_CMS
+#endif
+#ifndef OPENSSL_NO_COMP
+# define OPENSSL_NO_COMP
+#endif
+#ifndef OPENSSL_NO_CONF
+# define OPENSSL_NO_CONF
+#endif
+#ifndef OPENSSL_NO_DES
+# define OPENSSL_NO_DES
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_LOCKING
+# define OPENSSL_NO_LOCKING
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MD4
+# define OPENSSL_NO_MD4
+#endif
+#ifndef OPENSSL_NO_MD5
+# define OPENSSL_NO_MD5
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_OCSP
+# define OPENSSL_NO_OCSP
+#endif
+#ifndef OPENSSL_NO_PEM
+# define OPENSSL_NO_PEM
+#endif
+#ifndef OPENSSL_NO_PKCS12
+# define OPENSSL_NO_PKCS12
+#endif
+#ifndef OPENSSL_NO_PQUEUE
+# define OPENSSL_NO_PQUEUE
+#endif
+#ifndef OPENSSL_NO_RC2
+# define OPENSSL_NO_RC2
+#endif
+#ifndef OPENSSL_NO_RC4
+# define OPENSSL_NO_RC4
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RIPEMD
+# define OPENSSL_NO_RIPEMD
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_SRP
+# define OPENSSL_NO_SRP
+#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
+#ifndef OPENSSL_NO_SSL3
+# define OPENSSL_NO_SSL3
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_TLS1
+# define OPENSSL_NO_TLS1
+#endif
+#ifndef OPENSSL_NO_TLSEXT
+# define OPENSSL_NO_TLSEXT
+#endif
+#ifndef OPENSSL_NO_TS
+# define OPENSSL_NO_TS
+#endif
+#ifndef OPENSSL_NO_TXT_DB
+# define OPENSSL_NO_TXT_DB
+#endif
+#ifndef OPENSSL_NO_UI
+# define OPENSSL_NO_UI
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_NO_ERR
+# define OPENSSL_NO_ERR
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS)
+# define NO_CMS
+# endif
+# if defined(OPENSSL_NO_COMP) && !defined(NO_COMP)
+# define NO_COMP
+# endif
+# if defined(OPENSSL_NO_CONF) && !defined(NO_CONF)
+# define NO_CONF
+# endif
+# if defined(OPENSSL_NO_DES) && !defined(NO_DES)
+# define NO_DES
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_LOCKING) && !defined(NO_LOCKING)
+# define NO_LOCKING
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MD4) && !defined(NO_MD4)
+# define NO_MD4
+# endif
+# if defined(OPENSSL_NO_MD5) && !defined(NO_MD5)
+# define NO_MD5
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_OCSP) && !defined(NO_OCSP)
+# define NO_OCSP
+# endif
+# if defined(OPENSSL_NO_PEM) && !defined(NO_PEM)
+# define NO_PEM
+# endif
+# if defined(OPENSSL_NO_PKCS12) && !defined(NO_PKCS12)
+# define NO_PKCS12
+# endif
+# if defined(OPENSSL_NO_PQUEUE) && !defined(NO_PQUEUE)
+# define NO_PQUEUE
+# endif
+# if defined(OPENSSL_NO_RC2) && !defined(NO_RC2)
+# define NO_RC2
+# endif
+# if defined(OPENSSL_NO_RC4) && !defined(NO_RC4)
+# define NO_RC4
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD)
+# define NO_RIPEMD
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_SRP) && !defined(NO_SRP)
+# define NO_SRP
+# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
+# if defined(OPENSSL_NO_SSL3) && !defined(NO_SSL3)
+# define NO_SSL3
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_TLS1) && !defined(NO_TLS1)
+# define NO_TLS1
+# endif
+# if defined(OPENSSL_NO_TLSEXT) && !defined(NO_TLSEXT)
+# define NO_TLSEXT
+# endif
+# if defined(OPENSSL_NO_TS) && !defined(NO_TS)
+# define NO_TS
+# endif
+# if defined(OPENSSL_NO_TXT_DB) && !defined(NO_TXT_DB)
+# define NO_TXT_DB
+# endif
+# if defined(OPENSSL_NO_UI) && !defined(NO_UI)
+# define NO_UI
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <trusty_std.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned int
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#undef RC4_CHUNK
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned long
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#undef BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#undef DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf.h b/main/openssl/crypto/opensslconf.h
index 26ac6ba3..94212a08 100644
--- a/main/openssl/crypto/opensslconf.h
+++ b/main/openssl/crypto/opensslconf.h
@@ -1,250 +1,10 @@
-/* opensslconf.h */
-/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
-
-/* OpenSSL was configured with the following options: */
-#ifndef OPENSSL_DOING_MAKEDEPEND
-
-
-#ifndef OPENSSL_NO_CAST
-# define OPENSSL_NO_CAST
+// Auto-generated - DO NOT EDIT!
+#ifndef OPENSSL_SYS_TRUSTY
+#if defined(__LP64__)
+#include "opensslconf-64.h"
+#else
+#include "opensslconf-32.h"
#endif
-#ifndef OPENSSL_NO_GMP
-# define OPENSSL_NO_GMP
+#else
+#include "opensslconf-trusty.h"
#endif
-#ifndef OPENSSL_NO_IDEA
-# define OPENSSL_NO_IDEA
-#endif
-#ifndef OPENSSL_NO_JPAKE
-# define OPENSSL_NO_JPAKE
-#endif
-#ifndef OPENSSL_NO_KRB5
-# define OPENSSL_NO_KRB5
-#endif
-#ifndef OPENSSL_NO_MD2
-# define OPENSSL_NO_MD2
-#endif
-#ifndef OPENSSL_NO_RC5
-# define OPENSSL_NO_RC5
-#endif
-#ifndef OPENSSL_NO_RFC3779
-# define OPENSSL_NO_RFC3779
-#endif
-#ifndef OPENSSL_NO_SEED
-# define OPENSSL_NO_SEED
-#endif
-#ifndef OPENSSL_NO_SHA0
-# define OPENSSL_NO_SHA0
-#endif
-#ifndef OPENSSL_NO_STORE
-# define OPENSSL_NO_STORE
-#endif
-#ifndef OPENSSL_NO_WHRLPOOL
-# define OPENSSL_NO_WHRLPOOL
-#endif
-
-#endif /* OPENSSL_DOING_MAKEDEPEND */
-
-#ifndef OPENSSL_THREADS
-# define OPENSSL_THREADS
-#endif
-#ifndef OPENSSL_NO_DYNAMIC_ENGINE
-# define OPENSSL_NO_DYNAMIC_ENGINE
-#endif
-
-/* The OPENSSL_NO_* macros are also defined as NO_* if the application
- asks for it. This is a transient feature that is provided for those
- who haven't had the time to do the appropriate changes in their
- applications. */
-#ifdef OPENSSL_ALGORITHM_DEFINES
-# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
-# define NO_CAST
-# endif
-# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
-# define NO_GMP
-# endif
-# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
-# define NO_IDEA
-# endif
-# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
-# define NO_JPAKE
-# endif
-# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
-# define NO_KRB5
-# endif
-# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
-# define NO_MD2
-# endif
-# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
-# define NO_RC5
-# endif
-# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
-# define NO_RFC3779
-# endif
-# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
-# define NO_SEED
-# endif
-# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
-# define NO_SHA0
-# endif
-# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
-# define NO_STORE
-# endif
-# if defined(OPENSSL_NO_WHRLPOOL) && !defined(NO_WHRLPOOL)
-# define NO_WHRLPOOL
-# endif
-#endif
-
-/* crypto/opensslconf.h.in */
-
-/* Generate 80386 code? */
-#undef I386_ONLY
-
-#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
-#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
-#define ENGINESDIR "/usr/local/ssl/lib/engines"
-#define OPENSSLDIR "/usr/local/ssl"
-#endif
-#endif
-
-#undef OPENSSL_UNISTD
-#define OPENSSL_UNISTD <unistd.h>
-
-#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
-
-#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
-#define IDEA_INT unsigned int
-#endif
-
-#if defined(HEADER_MD2_H) && !defined(MD2_INT)
-#define MD2_INT unsigned int
-#endif
-
-#if defined(HEADER_RC2_H) && !defined(RC2_INT)
-/* I need to put in a mod for the alpha - eay */
-#define RC2_INT unsigned int
-#endif
-
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_INT)
-/* using int types make the structure larger but make the code faster
- * on most boxes I have tested - up to %20 faster. */
-/*
- * I don't know what does "most" mean, but declaring "int" is a must on:
- * - Intel P6 because partial register stalls are very expensive;
- * - elder Alpha because it lacks byte load/store instructions;
- */
-#define RC4_INT unsigned char
-#endif
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
-
-#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
-/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
- * %20 speed up (longs are 8 bytes, int's are 4). */
-#ifndef DES_LONG
-#define DES_LONG unsigned int
-#endif
-#endif
-
-#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
-#define CONFIG_HEADER_BN_H
-#define BN_LLONG
-
-/* Should we define BN_DIV2W here? */
-
-/* Only one for the following should be defined */
-#undef SIXTY_FOUR_BIT_LONG
-#undef SIXTY_FOUR_BIT
-#define THIRTY_TWO_BIT
-#endif
-
-#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
-#define CONFIG_HEADER_RC4_LOCL_H
-/* if this is defined data[i] is used instead of *data, this is a %20
- * speedup on x86 */
-#undef RC4_INDEX
-#endif
-
-#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
-#define CONFIG_HEADER_BF_LOCL_H
-#define BF_PTR
-#endif /* HEADER_BF_LOCL_H */
-
-#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
-#define CONFIG_HEADER_DES_LOCL_H
-#ifndef DES_DEFAULT_OPTIONS
-/* the following is tweaked from a config script, that is why it is a
- * protected undef/define */
-#ifndef DES_PTR
-#undef DES_PTR
-#endif
-
-/* This helps C compiler generate the correct code for multiple functional
- * units. It reduces register dependancies at the expense of 2 more
- * registers */
-#ifndef DES_RISC1
-#undef DES_RISC1
-#endif
-
-#ifndef DES_RISC2
-#undef DES_RISC2
-#endif
-
-#if defined(DES_RISC1) && defined(DES_RISC2)
-YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
-#endif
-
-/* Unroll the inner loop, this sometimes helps, sometimes hinders.
- * Very mucy CPU dependant */
-#ifndef DES_UNROLL
-#define DES_UNROLL
-#endif
-
-/* These default values were supplied by
- * Peter Gutman <pgut001@cs.auckland.ac.nz>
- * They are only used if nothing else has been defined */
-#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
-/* Special defines which change the way the code is built depending on the
- CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
- even newer MIPS CPU's, but at the moment one size fits all for
- optimization options. Older Sparc's work better with only UNROLL, but
- there's no way to tell at compile time what it is you're running on */
-
-#if defined( sun ) /* Newer Sparc's */
-# define DES_PTR
-# define DES_RISC1
-# define DES_UNROLL
-#elif defined( __ultrix ) /* Older MIPS */
-# define DES_PTR
-# define DES_RISC2
-# define DES_UNROLL
-#elif defined( __osf1__ ) /* Alpha */
-# define DES_PTR
-# define DES_RISC2
-#elif defined ( _AIX ) /* RS6000 */
- /* Unknown */
-#elif defined( __hpux ) /* HP-PA */
- /* Unknown */
-#elif defined( __aux ) /* 68K */
- /* Unknown */
-#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
-# define DES_UNROLL
-#elif defined( __sgi ) /* Newer MIPS */
-# define DES_PTR
-# define DES_RISC2
-# define DES_UNROLL
-#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
-# define DES_PTR
-# define DES_RISC1
-# define DES_UNROLL
-#endif /* Systems-specific speed defines */
-#endif
-
-#endif /* DES_DEFAULT_OPTIONS */
-#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslv.h b/main/openssl/crypto/opensslv.h
index 310a3387..b27a5bb8 100644
--- a/main/openssl/crypto/opensslv.h
+++ b/main/openssl/crypto/opensslv.h
@@ -25,11 +25,11 @@
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-#define OPENSSL_VERSION_NUMBER 0x1000005fL
+#define OPENSSL_VERSION_NUMBER 0x1000106fL
#ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0e-fips 6 Sep 2011"
+#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1f-fips 6 Jan 2014"
#else
-#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0e 6 Sep 2011"
+#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1f 6 Jan 2014"
#endif
#define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
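Annotation: OPENSSL_VERSION_NUMBER packs its fields as MNNFFPPS (major, minor, fix, patch, status), so 0x1000106fL reads as 1.0.1f with status 0xf (release). A small decoder, written here only to document the layout:

#include <stdio.h>

/* MNNFFPPS layout: 0x1000106fL -> major 1, minor 0, fix 1,
 * patch 0x06 ('f', since 0x01 is 'a'), status 0xf (release). */
static void print_version(long v)
{
    long patch = (v >> 4) & 0xff;
    printf("%ld.%ld.%ld%c (status 0x%lx)\n",
           (v >> 28) & 0xf, (v >> 20) & 0xff, (v >> 12) & 0xff,
           patch ? (int)('a' + patch - 1) : ' ',
           v & 0xf);
}

int main(void) { print_version(0x1000106fL); return 0; }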
diff --git a/main/openssl/crypto/ossl_typ.h b/main/openssl/crypto/ossl_typ.h
index 12bd7014..ea9227f6 100644
--- a/main/openssl/crypto/ossl_typ.h
+++ b/main/openssl/crypto/ossl_typ.h
@@ -91,10 +91,12 @@ typedef struct asn1_string_st ASN1_TIME;
typedef struct asn1_string_st ASN1_GENERALIZEDTIME;
typedef struct asn1_string_st ASN1_VISIBLESTRING;
typedef struct asn1_string_st ASN1_UTF8STRING;
+typedef struct asn1_string_st ASN1_STRING;
typedef int ASN1_BOOLEAN;
typedef int ASN1_NULL;
#endif
+typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct asn1_pctx_st ASN1_PCTX;
#ifdef OPENSSL_SYS_WIN32
diff --git a/main/openssl/crypto/pariscid.pl b/main/openssl/crypto/pariscid.pl
new file mode 100644
index 00000000..bfc56fdc
--- /dev/null
+++ b/main/openssl/crypto/pariscid.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $ST ="std";
+} else {
+ $LEVEL ="1.1";
+ $SIZE_T =4;
+ $ST ="stw";
+}
+
+$rp="%r2";
+$sp="%r30";
+$rv="%r28";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT OPENSSL_cpuid_setup,ENTRY
+ .ALIGN 8
+OPENSSL_cpuid_setup
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT OPENSSL_rdtsc,ENTRY
+ .ALIGN 8
+OPENSSL_rdtsc
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ mfctl %cr16,$rv
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT OPENSSL_wipe_cpu,ENTRY
+ .ALIGN 8
+OPENSSL_wipe_cpu
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ xor %r0,%r0,%r1
+ fcpy,dbl %fr0,%fr4
+ xor %r0,%r0,%r19
+ fcpy,dbl %fr0,%fr5
+ xor %r0,%r0,%r20
+ fcpy,dbl %fr0,%fr6
+ xor %r0,%r0,%r21
+ fcpy,dbl %fr0,%fr7
+ xor %r0,%r0,%r22
+ fcpy,dbl %fr0,%fr8
+ xor %r0,%r0,%r23
+ fcpy,dbl %fr0,%fr9
+ xor %r0,%r0,%r24
+ fcpy,dbl %fr0,%fr10
+ xor %r0,%r0,%r25
+ fcpy,dbl %fr0,%fr11
+ xor %r0,%r0,%r26
+ fcpy,dbl %fr0,%fr22
+ xor %r0,%r0,%r29
+ fcpy,dbl %fr0,%fr23
+ xor %r0,%r0,%r31
+ fcpy,dbl %fr0,%fr24
+ fcpy,dbl %fr0,%fr25
+ fcpy,dbl %fr0,%fr26
+ fcpy,dbl %fr0,%fr27
+ fcpy,dbl %fr0,%fr28
+ fcpy,dbl %fr0,%fr29
+ fcpy,dbl %fr0,%fr30
+ fcpy,dbl %fr0,%fr31
+ bv ($rp)
+ .EXIT
+ ldo 0($sp),$rv
+ .PROCEND
+___
+{
+my $inp="%r26";
+my $len="%r25";
+
+$code.=<<___;
+ .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_cleanse
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ cmpib,*= 0,$len,L\$done
+ nop
+ cmpib,*>>= 15,$len,L\$ittle
+ ldi $SIZE_T-1,%r1
+
+L\$align
+ and,*<> $inp,%r1,%r28
+ b,n L\$aligned
+ stb %r0,0($inp)
+ ldo -1($len),$len
+ b L\$align
+ ldo 1($inp),$inp
+
+L\$aligned
+ andcm $len,%r1,%r28
+L\$ot
+ $ST %r0,0($inp)
+ addib,*<> -$SIZE_T,%r28,L\$ot
+ ldo $SIZE_T($inp),$inp
+
+ and,*<> $len,%r1,$len
+ b,n L\$done
+L\$ittle
+ stb %r0,0($inp)
+ addib,*<> -1,$len,L\$ittle
+ ldo 1($inp),$inp
+L\$done
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+___
+}
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
+
+$code.=<<___;
+ .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+L\$oop
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,<> -1,$cnt,L\$oop
+ addi 4,$out,$out
+
+ bv ($rp)
+ .EXIT
+ sub $rv,$cnt,$rv
+ .PROCEND
+
+ .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus2
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ sub %r0,$cnt,$cnt
+
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+L\$oop2
+ copy $diff,$lastdiff
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,= -1,$max,L\$done2
+ nop
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+ cmpclr,<> $lastdiff,$diff,$tick
+ ldi 1,$tick
+
+ ldi 1,%r1
+ xor %r1,$tick,$tick
+ addb,<> $tick,$cnt,L\$oop2
+ shladd,l $tick,2,$out,$out
+L\$done2
+ bv ($rp)
+ .EXIT
+ add $rv,$cnt,$rv
+ .PROCEND
+___
+}
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+print $code;
+close STDOUT;
+
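Annotation: the OPENSSL_cleanse routine above zeroes byte-by-byte until the pointer is word-aligned, then stores a word at a time; writing it in assembly keeps the compiler from optimizing the stores away. Roughly equivalent C, as a sketch only (the portable version in crypto/mem_clr.c uses a different anti-elision trick):

#include <stddef.h>

/* Zero 'len' bytes of 'ptr' in a way the optimizer must not remove.
 * The volatile-qualified walk is illustrative, not OpenSSL's code. */
static void cleanse_sketch(void *ptr, size_t len)
{
    volatile unsigned char *p = ptr;
    while (len--)
        *p++ = 0;
}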
diff --git a/main/openssl/crypto/pem/pem_all.c b/main/openssl/crypto/pem/pem_all.c
index 3e7a6093..eac0460e 100644
--- a/main/openssl/crypto/pem/pem_all.c
+++ b/main/openssl/crypto/pem/pem_all.c
@@ -193,7 +193,61 @@ RSA *PEM_read_RSAPrivateKey(FILE *fp, RSA **rsa, pem_password_cb *cb,
#endif
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_RSAPrivateKey(BIO *bp, RSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_RSA(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_RSAPrivateKey,
+ PEM_STRING_RSA,bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_RSAPrivateKey(FILE *fp, RSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+
+ EVP_PKEY_set1_RSA(k, x);
+
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_RSAPrivateKey,
+ PEM_STRING_RSA,fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb_const(RSAPrivateKey, RSA, PEM_STRING_RSA, RSAPrivateKey)
+
+#endif
+
IMPLEMENT_PEM_rw_const(RSAPublicKey, RSA, PEM_STRING_RSA_PUBLIC, RSAPublicKey)
IMPLEMENT_PEM_rw(RSA_PUBKEY, RSA, PEM_STRING_PUBLIC, RSA_PUBKEY)
@@ -223,7 +277,59 @@ DSA *PEM_read_bio_DSAPrivateKey(BIO *bp, DSA **dsa, pem_password_cb *cb,
return pkey_get_dsa(pktmp, dsa); /* will free pktmp */
}
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_DSAPrivateKey(BIO *bp, DSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_DSA(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_DSAPrivateKey,
+ PEM_STRING_DSA,bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_DSAPrivateKey(FILE *fp, DSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_DSA(k, x);
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_DSAPrivateKey,
+ PEM_STRING_DSA,fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA, DSAPrivateKey)
+
+#endif
+
IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
#ifndef OPENSSL_NO_FP_API
@@ -269,8 +375,63 @@ EC_KEY *PEM_read_bio_ECPrivateKey(BIO *bp, EC_KEY **key, pem_password_cb *cb,
IMPLEMENT_PEM_rw_const(ECPKParameters, EC_GROUP, PEM_STRING_ECPARAMETERS, ECPKParameters)
+
+
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_ECPrivateKey(BIO *bp, EC_KEY *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_EC_KEY(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_ECPrivateKey,
+ PEM_STRING_ECPRIVATEKEY,
+ bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_ECPrivateKey(FILE *fp, EC_KEY *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_EC_KEY(k, x);
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_ECPrivateKey,
+ PEM_STRING_ECPRIVATEKEY,
+ fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY, ECPrivateKey)
+#endif
+
IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
#ifndef OPENSSL_NO_FP_API
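Annotation: caller-visible behavior of the FIPS wrappers added above: the legacy per-key-type writers keep their signatures, but under FIPS_mode() they detour through an EVP_PKEY so serialization goes through the FIPS module. Existing call sites compile unchanged; a sketch of one (save_rsa_key is a hypothetical caller):

#include <string.h>
#include <openssl/pem.h>

/* Hypothetical caller: identical whether FIPS mode is on or off. */
static int save_rsa_key(BIO *bp, RSA *rsa, const char *pass)
{
    return PEM_write_bio_RSAPrivateKey(bp, rsa, EVP_aes_256_cbc(),
                                       (unsigned char *)pass,
                                       (int)strlen(pass), NULL, NULL);
}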
diff --git a/main/openssl/crypto/pem/pem_info.c b/main/openssl/crypto/pem/pem_info.c
index 1b2be527..cc7f24a9 100644
--- a/main/openssl/crypto/pem/pem_info.c
+++ b/main/openssl/crypto/pem/pem_info.c
@@ -167,6 +167,7 @@ start:
#ifndef OPENSSL_NO_RSA
if (strcmp(name,PEM_STRING_RSA) == 0)
{
+ d2i=(D2I_OF(void))d2i_RSAPrivateKey;
if (xi->x_pkey != NULL)
{
if (!sk_X509_INFO_push(ret,xi)) goto err;
diff --git a/main/openssl/crypto/pem/pem_lib.c b/main/openssl/crypto/pem/pem_lib.c
index cfc89a99..5a421fc4 100644
--- a/main/openssl/crypto/pem/pem_lib.c
+++ b/main/openssl/crypto/pem/pem_lib.c
@@ -394,7 +394,8 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp,
goto err;
/* The 'iv' is used as the iv and as a salt. It is
* NOT taken from the BytesToKey function */
- EVP_BytesToKey(enc,EVP_md5(),iv,kstr,klen,1,key,NULL);
+ if (!EVP_BytesToKey(enc,EVP_md5(),iv,kstr,klen,1,key,NULL))
+ goto err;
if (kstr == (unsigned char *)buf) OPENSSL_cleanse(buf,PEM_BUFSIZE);
@@ -406,12 +407,15 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp,
/* k=strlen(buf); */
EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,enc,NULL,key,iv);
- EVP_EncryptUpdate(&ctx,data,&j,data,i);
- EVP_EncryptFinal_ex(&ctx,&(data[j]),&i);
+ ret = 1;
+ if (!EVP_EncryptInit_ex(&ctx,enc,NULL,key,iv)
+ || !EVP_EncryptUpdate(&ctx,data,&j,data,i)
+ || !EVP_EncryptFinal_ex(&ctx,&(data[j]),&i))
+ ret = 0;
EVP_CIPHER_CTX_cleanup(&ctx);
+ if (ret == 0)
+ goto err;
i+=j;
- ret=1;
}
else
{
@@ -459,14 +463,17 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen,
ebcdic2ascii(buf, buf, klen);
#endif
- EVP_BytesToKey(cipher->cipher,EVP_md5(),&(cipher->iv[0]),
- (unsigned char *)buf,klen,1,key,NULL);
+ if (!EVP_BytesToKey(cipher->cipher,EVP_md5(),&(cipher->iv[0]),
+ (unsigned char *)buf,klen,1,key,NULL))
+ return 0;
j=(int)len;
EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,cipher->cipher,NULL, key,&(cipher->iv[0]));
- EVP_DecryptUpdate(&ctx,data,&i,data,j);
- o=EVP_DecryptFinal_ex(&ctx,&(data[i]),&j);
+ o = EVP_DecryptInit_ex(&ctx,cipher->cipher,NULL, key,&(cipher->iv[0]));
+ if (o)
+ o = EVP_DecryptUpdate(&ctx,data,&i,data,j);
+ if (o)
+ o = EVP_DecryptFinal_ex(&ctx,&(data[i]),&j);
EVP_CIPHER_CTX_cleanup(&ctx);
OPENSSL_cleanse((char *)buf,sizeof(buf));
OPENSSL_cleanse((char *)key,sizeof(key));
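Annotation: the common thread in these pem_lib.c fixes is that every EVP return code now feeds the result, while EVP_CIPHER_CTX_cleanup still runs on both success and failure. The shape, condensed into a hypothetical helper (encrypt_checked is not a real API):

#include <openssl/evp.h>

/* Illustrative only: chain the EVP calls, clean up unconditionally,
 * and report failure without leaking the cipher context. */
static int encrypt_checked(const EVP_CIPHER *enc, unsigned char *key,
                           unsigned char *iv, unsigned char *data,
                           int inlen, int *outlen)
{
    EVP_CIPHER_CTX ctx;
    int ok, len1 = 0, len2 = 0;
    EVP_CIPHER_CTX_init(&ctx);
    ok = EVP_EncryptInit_ex(&ctx, enc, NULL, key, iv)
         && EVP_EncryptUpdate(&ctx, data, &len1, data, inlen)
         && EVP_EncryptFinal_ex(&ctx, data + len1, &len2);
    EVP_CIPHER_CTX_cleanup(&ctx);      /* runs on every path */
    if (ok)
        *outlen = len1 + len2;
    return ok;
}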
diff --git a/main/openssl/crypto/pem/pem_seal.c b/main/openssl/crypto/pem/pem_seal.c
index 59690b56..b6b4e134 100644
--- a/main/openssl/crypto/pem/pem_seal.c
+++ b/main/openssl/crypto/pem/pem_seal.c
@@ -96,7 +96,8 @@ int PEM_SealInit(PEM_ENCODE_SEAL_CTX *ctx, EVP_CIPHER *type, EVP_MD *md_type,
EVP_EncodeInit(&ctx->encode);
EVP_MD_CTX_init(&ctx->md);
- EVP_SignInit(&ctx->md,md_type);
+ if (!EVP_SignInit(&ctx->md,md_type))
+ goto err;
EVP_CIPHER_CTX_init(&ctx->cipher);
ret=EVP_SealInit(&ctx->cipher,type,ek,ekl,iv,pubk,npubk);
@@ -163,7 +164,8 @@ int PEM_SealFinal(PEM_ENCODE_SEAL_CTX *ctx, unsigned char *sig, int *sigl,
goto err;
}
- EVP_EncryptFinal_ex(&ctx->cipher,s,(int *)&i);
+ if (!EVP_EncryptFinal_ex(&ctx->cipher,s,(int *)&i))
+ goto err;
EVP_EncodeUpdate(&ctx->encode,out,&j,s,i);
*outl=j;
out+=j;
diff --git a/main/openssl/crypto/pem/pvkfmt.c b/main/openssl/crypto/pem/pvkfmt.c
index 5f130c45..b1bf71a5 100644
--- a/main/openssl/crypto/pem/pvkfmt.c
+++ b/main/openssl/crypto/pem/pvkfmt.c
@@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key,
const unsigned char *pass, int passlen)
{
EVP_MD_CTX mctx;
+ int rv = 1;
EVP_MD_CTX_init(&mctx);
- EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL);
- EVP_DigestUpdate(&mctx, salt, saltlen);
- EVP_DigestUpdate(&mctx, pass, passlen);
- EVP_DigestFinal_ex(&mctx, key, NULL);
+ if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL)
+ || !EVP_DigestUpdate(&mctx, salt, saltlen)
+ || !EVP_DigestUpdate(&mctx, pass, passlen)
+ || !EVP_DigestFinal_ex(&mctx, key, NULL))
+ rv = 0;
+
EVP_MD_CTX_cleanup(&mctx);
- return 1;
+ return rv;
}
@@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
const unsigned char *p = *in;
unsigned int magic;
unsigned char *enctmp = NULL, *q;
+ EVP_CIPHER_CTX cctx;
+ EVP_CIPHER_CTX_init(&cctx);
if (saltlen)
{
char psbuf[PEM_BUFSIZE];
unsigned char keybuf[20];
- EVP_CIPHER_CTX cctx;
int enctmplen, inlen;
if (cb)
inlen=cb(psbuf,PEM_BUFSIZE,0,u);
@@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
p += 8;
inlen = keylen - 8;
q = enctmp + 8;
- EVP_CIPHER_CTX_init(&cctx);
- EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
- EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
- EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen);
+ if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen))
+ goto err;
magic = read_ledword((const unsigned char **)&q);
if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
{
q = enctmp + 8;
memset(keybuf + 5, 0, 11);
- EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
- NULL);
+ if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
+ NULL))
+ goto err;
OPENSSL_cleanse(keybuf, 20);
- EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
- EVP_DecryptFinal_ex(&cctx, q + enctmplen,
- &enctmplen);
+ if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen,
+ &enctmplen))
+ goto err;
magic = read_ledword((const unsigned char **)&q);
if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
{
- EVP_CIPHER_CTX_cleanup(&cctx);
PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT);
goto err;
}
}
else
OPENSSL_cleanse(keybuf, 20);
- EVP_CIPHER_CTX_cleanup(&cctx);
p = enctmp;
}
ret = b2i_PrivateKey(&p, keylen);
err:
+ EVP_CIPHER_CTX_cleanup(&cctx);
if (enctmp && saltlen)
OPENSSL_free(enctmp);
return ret;
@@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
{
int outlen = 24, pklen;
unsigned char *p, *salt = NULL;
+ EVP_CIPHER_CTX cctx;
+ EVP_CIPHER_CTX_init(&cctx);
if (enclevel)
outlen += PVK_SALTLEN;
pklen = do_i2b(NULL, pk, 0);
@@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
{
char psbuf[PEM_BUFSIZE];
unsigned char keybuf[20];
- EVP_CIPHER_CTX cctx;
int enctmplen, inlen;
if (cb)
inlen=cb(psbuf,PEM_BUFSIZE,1,u);
@@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
if (enclevel == 1)
memset(keybuf + 5, 0, 11);
p = salt + PVK_SALTLEN + 8;
- EVP_CIPHER_CTX_init(&cctx);
- EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
+ if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+ goto error;
OPENSSL_cleanse(keybuf, 20);
- EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8);
- EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen);
- EVP_CIPHER_CTX_cleanup(&cctx);
+ if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8))
+ goto error;
+ if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen))
+ goto error;
}
+ EVP_CIPHER_CTX_cleanup(&cctx);
return outlen;
error:
+ EVP_CIPHER_CTX_cleanup(&cctx);
return -1;
}
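The pvkfmt.c changes hoist EVP_CIPHER_CTX_init() above the first failure path so a single cleanup at function exit is always valid. The idiom, sketched with hypothetical names (RC4 as in the patch, so a 16-byte default key and no IV are assumed):

    #include <openssl/evp.h>

    static int decrypt_checked(const unsigned char *key, const unsigned char *in,
                               int inlen, unsigned char *out, int *outlen)
    {
        EVP_CIPHER_CTX cctx;
        int ret = 0, len = 0, fin = 0;

        EVP_CIPHER_CTX_init(&cctx);            /* before any goto err */
        if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, key, NULL))
            goto err;
        if (!EVP_DecryptUpdate(&cctx, out, &len, in, inlen))
            goto err;
        if (!EVP_DecryptFinal_ex(&cctx, out + len, &fin))
            goto err;
        *outlen = len + fin;
        ret = 1;
    err:
        EVP_CIPHER_CTX_cleanup(&cctx);         /* single, unconditional cleanup */
        return ret;
    }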
diff --git a/main/openssl/crypto/perlasm/cbc.pl b/main/openssl/crypto/perlasm/cbc.pl
index 6fc25109..24561e75 100644
--- a/main/openssl/crypto/perlasm/cbc.pl
+++ b/main/openssl/crypto/perlasm/cbc.pl
@@ -150,7 +150,7 @@ sub cbc
&set_label("PIC_point");
&blindpop("edx");
&lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx"));
- &mov($count,&DWP(0,"ecx",$count,4))
+ &mov($count,&DWP(0,"ecx",$count,4));
&add($count,"edx");
&xor("ecx","ecx");
&xor("edx","edx");
diff --git a/main/openssl/crypto/perlasm/ppc-xlate.pl b/main/openssl/crypto/perlasm/ppc-xlate.pl
index 4579671c..a3edd982 100755
--- a/main/openssl/crypto/perlasm/ppc-xlate.pl
+++ b/main/openssl/crypto/perlasm/ppc-xlate.pl
@@ -31,10 +31,9 @@ my $globl = sub {
$ret .= ".type $name,\@function";
last;
};
- /linux.*64/ && do { $ret .= ".globl .$name\n";
- $ret .= ".type .$name,\@function\n";
+ /linux.*64/ && do { $ret .= ".globl $name\n";
+ $ret .= ".type $name,\@function\n";
$ret .= ".section \".opd\",\"aw\"\n";
- $ret .= ".globl $name\n";
$ret .= ".align 3\n";
$ret .= "$name:\n";
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
@@ -62,6 +61,14 @@ my $machine = sub {
}
".machine $arch";
};
+my $size = sub {
+ if ($flavour =~ /linux.*32/)
+ { shift;
+ ".size " . join(",",@_);
+ }
+ else
+ { ""; }
+};
my $asciz = sub {
shift;
my $line = join(",",@_);
diff --git a/main/openssl/crypto/perlasm/x86_64-xlate.pl b/main/openssl/crypto/perlasm/x86_64-xlate.pl
index e47116b7..56d9b64b 100755
--- a/main/openssl/crypto/perlasm/x86_64-xlate.pl
+++ b/main/openssl/crypto/perlasm/x86_64-xlate.pl
@@ -62,12 +62,8 @@ my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
- my ($outdev,$outino,@junk)=stat($output);
-
- open STDOUT,">$output" || die "can't open $output: $!"
- if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+ if (defined($output));
my $gas=1; $gas=0 if ($output =~ /\.asm$/);
my $elf=1; $elf=0 if (!$gas);
@@ -116,12 +112,16 @@ my %globals;
$line = substr($line,@+[0]); $line =~ s/^\s+//;
undef $self->{sz};
- if ($self->{op} =~ /^(movz)b.*/) { # movz is pain...
+ if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain...
$self->{op} = $1;
- $self->{sz} = "b";
+ $self->{sz} = $2;
} elsif ($self->{op} =~ /call|jmp/) {
$self->{sz} = "";
- } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op)/) { # SSEn
+ } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /^v/) { # VEX
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
$self->{sz} = "";
} elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
$self->{op} = $1;
@@ -246,35 +246,39 @@ my %globals;
$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+ # Solaris /usr/ccs/bin/as can't handle multiplications
+ # in $self->{label}, new gas requires sign extension...
+ use integer;
+ $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+ $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+
if ($gas) {
- # Solaris /usr/ccs/bin/as can't handle multiplications
- # in $self->{label}, new gas requires sign extension...
- use integer;
- $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
- $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
$self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
if (defined($self->{index})) {
- sprintf "%s%s(%%%s,%%%s,%d)",$self->{asterisk},
- $self->{label},$self->{base},
+ sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
+ $self->{label},
+ $self->{base}?"%$self->{base}":"",
$self->{index},$self->{scale};
} else {
sprintf "%s%s(%%%s)", $self->{asterisk},$self->{label},$self->{base};
}
} else {
- %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", q=>"QWORD$PTR" );
+ %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
+ q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
$self->{label} =~ s/\./\$/g;
$self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
$self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
- $sz="q" if ($self->{asterisk});
+ $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
+ $sz="l" if (opcode->mnemonic() eq "movd");
if (defined($self->{index})) {
- sprintf "%s[%s%s*%d+%s]",$szmap{$sz},
+ sprintf "%s[%s%s*%d%s]",$szmap{$sz},
$self->{label}?"$self->{label}+":"",
$self->{index},$self->{scale},
- $self->{base};
+ $self->{base}?"+$self->{base}":"";
} elsif ($self->{base} eq "rip") {
sprintf "%s[%s]",$szmap{$sz},$self->{label};
} else {
@@ -506,6 +510,12 @@ my %globals;
}
} elsif ($dir =~ /\.(text|data)/) {
$current_segment=".$1";
+ } elsif ($dir =~ /\.hidden/) {
+ if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$line"; }
+ elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+ } elsif ($dir =~ /\.comm/) {
+ $self->{value} = "$dir\t$prefix$line";
+ $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
}
$line = "";
return $self;
@@ -555,7 +565,8 @@ my %globals;
$v.=" READONLY";
$v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
} elsif ($line=~/\.CRT\$/i) {
- $v.=" READONLY DWORD";
+ $v.=" READONLY ";
+ $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
}
}
$current_segment = $line;
@@ -577,7 +588,7 @@ my %globals;
$self->{value}="${decor}SEH_end_$current_function->{name}:";
$self->{value}.=":\n" if($masm);
}
- $self->{value}.="$current_function->{name}\tENDP" if($masm);
+ $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
undef $current_function;
}
last;
@@ -613,6 +624,19 @@ my %globals;
.join(",",@str) if (@str);
last;
};
+ /\.comm/ && do { my @str=split(/,\s*/,$line);
+ my $v=undef;
+ if ($nasm) {
+ $v.="common $prefix@str[0] @str[1]";
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $current_segment = "_DATA";
+ $v.="$current_segment\tSEGMENT\n";
+ $v.="COMM @str[0]:DWORD:".@str[1]/4;
+ }
+ $self->{value} = $v;
+ last;
+ };
}
$line = "";
}
@@ -625,9 +649,133 @@ my %globals;
}
}
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,($rex|0x40) if ($rex);
+}
+
+# older gas and ml64 don't handle SSE>2 instructions
+my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+ "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 );
+
+my $movq = sub { # elderly gas can't handle inter-register movq
+ my $arg = shift;
+ my @opcode=(0x66);
+ if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+ my ($src,$dst)=($1,$2);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x7e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+ my ($src,$dst)=($2,$1);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x6e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pextrd = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+ my @opcode=(0x66);
+ $imm=$1;
+ $src=$2;
+ $dst=$3;
+ if ($dst =~ /%r([0-9]+)d/) { $dst = $1; }
+ elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; }
+ rex(\@opcode,$src,$dst);
+ push @opcode,0x0f,0x3a,0x16;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pinsrd = sub {
+ if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ $imm=$1;
+ $src=$2;
+ $dst=$3;
+ if ($src =~ /%r([0-9]+)/) { $src = $1; }
+ elsif ($src =~ /%e/) { $src = $regrm{$src}; }
+ rex(\@opcode,$dst,$src);
+ push @opcode,0x0f,0x3a,0x22;
+ push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pshufb = sub {
+ if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$2,$1);
+ push @opcode,0x0f,0x38,0x00;
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $palignr = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x0f;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ push @opcode,$1;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pclmulqdq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x44;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $rdrand = sub {
+ if (shift =~ /%[er](\w+)/) {
+ my @opcode=();
+ my $dst=$1;
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,0,$1,8);
+ push @opcode,0x0f,0xc7,0xf0|($dst&7);
+ @opcode;
+ } else {
+ ();
+ }
+};
+
if ($nasm) {
print <<___;
default rel
+%define XMMWORD
___
} elsif ($masm) {
print <<___;
@@ -644,14 +792,22 @@ while($line=<>) {
undef $label;
undef $opcode;
- undef $sz;
undef @args;
if ($label=label->re(\$line)) { print $label->out(); }
if (directive->re(\$line)) {
printf "%s",directive->out();
- } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
+ } elsif ($opcode=opcode->re(\$line)) {
+ my $asm = eval("\$".$opcode->mnemonic());
+ undef @bytes;
+
+ if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
+ print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+ next;
+ }
+
+ ARGUMENT: while (1) {
my $arg;
if ($arg=register->re(\$line)) { opcode->size($arg->size()); }
@@ -667,19 +823,26 @@ while($line=<>) {
$line =~ s/^,\s*//;
} # ARGUMENT:
- $sz=opcode->size();
-
if ($#args>=0) {
my $insn;
+ my $sz=opcode->size();
+
if ($gas) {
$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+ @args = map($_->out($sz),@args);
+ printf "\t%s\t%s",$insn,join(",",@args);
} else {
$insn = $opcode->out();
- $insn .= $sz if (map($_->out() =~ /x?mm/,@args));
+ foreach (@args) {
+ my $arg = $_->out();
+ # $insn.=$sz compensates for movq, pinsrw, ...
+ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
+ }
@args = reverse(@args);
undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+ printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
}
- printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
} else {
printf "\t%s",$opcode->out();
}
diff --git a/main/openssl/crypto/perlasm/x86asm.pl b/main/openssl/crypto/perlasm/x86asm.pl
index 28080caa..eb543db2 100644
--- a/main/openssl/crypto/perlasm/x86asm.pl
+++ b/main/openssl/crypto/perlasm/x86asm.pl
@@ -80,6 +80,57 @@ sub ::movq
{ &::generic("movq",@_); }
}
+# SSE>2 instructions
+my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+ else
+ { &::generic("pextrd",@_); }
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+ else
+ { &::generic("pinsrd",@_); }
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
+ else
+ { &::generic("pshufb",@_); }
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("palignr",@_); }
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("pclmulqdq",@_); }
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
# label management
$lbdecor="L"; # local label decoration, set by package
$label="000";
@@ -167,7 +218,7 @@ sub ::asm_init
$filename=$fn;
$i386=$cpu;
- $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=0;
+ $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
if (($type eq "elf"))
{ $elf=1; require "x86gas.pl"; }
elsif (($type eq "a\.out"))
@@ -184,6 +235,8 @@ sub ::asm_init
{ $win32=1; require "x86masm.pl"; }
elsif (($type eq "macosx"))
{ $aout=1; $macosx=1; require "x86gas.pl"; }
+ elsif (($type eq "android"))
+ { $elf=1; $android=1; require "x86gas.pl"; }
else
{ print STDERR <<"EOF";
Pick one target type from
diff --git a/main/openssl/crypto/perlasm/x86gas.pl b/main/openssl/crypto/perlasm/x86gas.pl
index 6eab727f..682a3a31 100644
--- a/main/openssl/crypto/perlasm/x86gas.pl
+++ b/main/openssl/crypto/perlasm/x86gas.pl
@@ -45,9 +45,8 @@ sub ::generic
undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
if ($#_==0) { &::emit($opcode); }
- elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); }
+ elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+ { &::emit($opcode,@arg); }
else { &::emit($opcode.$suffix,@arg);}
1;
@@ -91,6 +90,7 @@ sub ::DWP
}
sub ::QWP { &::DWP(@_); }
sub ::BP { &::DWP(@_); }
+sub ::WP { &::DWP(@_); }
sub ::BC { @_; }
sub ::DWC { @_; }
@@ -149,22 +149,24 @@ sub ::public_label
{ push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); }
sub ::file_end
-{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
- my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,4";
- if ($::elf) { push (@out,"$tmp,4\n"); }
- else { push (@out,"$tmp\n"); }
- }
- if ($::macosx)
+{ if ($::macosx)
{ if (%non_lazy_ptr)
{ push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n");
foreach $i (keys %non_lazy_ptr)
{ push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); }
}
}
+ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+ if ($::macosx) { push (@out,"$tmp,2\n"); }
+ elsif ($::elf) { push (@out,"$tmp,4\n"); }
+ else { push (@out,"$tmp\n"); }
+ }
push(@out,$initseg) if ($initseg);
}
sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); }
+sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
sub ::align
@@ -180,7 +182,7 @@ sub ::align
sub ::picmeup
{ my($dst,$sym,$base,$reflabel)=@_;
- if ($::pic && ($::elf || $::aout))
+ if (($::pic && ($::elf || $::aout)) || $::macosx)
{ if (!defined($base))
{ &::call(&::label("PIC_me_up"));
&::set_label("PIC_me_up");
@@ -206,13 +208,17 @@ sub ::picmeup
sub ::initseg
{ my $f=$nmdecor.shift;
- if ($::elf)
+ if ($::android)
+ { $initseg.=<<___;
+.section .init_array
+.align 4
+.long $f
+___
+ }
+ elsif ($::elf)
{ $initseg.=<<___;
.section .init
call $f
- jmp .Linitalign
-.align $align
-.Linitalign:
___
}
elsif ($::coff)
diff --git a/main/openssl/crypto/perlasm/x86masm.pl b/main/openssl/crypto/perlasm/x86masm.pl
index 3d50e4a7..f937d07c 100644
--- a/main/openssl/crypto/perlasm/x86masm.pl
+++ b/main/openssl/crypto/perlasm/x86masm.pl
@@ -14,9 +14,11 @@ sub ::generic
{ my ($opcode,@arg)=@_;
# fix hexadecimal constants
- for (@arg) { s/0x([0-9a-f]+)/0$1h/oi; }
+ for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
- if ($opcode !~ /movq/)
+ if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no []
+ { $opcode="mov"; }
+ elsif ($opcode !~ /movq/)
{ # fix xmm references
$arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
$arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
@@ -31,6 +33,7 @@ sub ::generic
sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); }
sub ::call_ptr { &::emit("call",@_); }
sub ::jmp_ptr { &::emit("jmp",@_); }
+sub ::lock { &::data_byte(0xf0); }
sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
@@ -65,6 +68,7 @@ sub get_mem
$ret;
}
sub ::BP { &get_mem("BYTE",@_); }
+sub ::WP { &get_mem("WORD",@_); }
sub ::DWP { &get_mem("DWORD",@_); }
sub ::QWP { &get_mem("QWORD",@_); }
sub ::BC { "@_"; }
@@ -129,7 +133,7 @@ ___
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
.bss SEGMENT 'BSS'
-COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD
+COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD
.bss ENDS
___
# comment out OPENSSL_ia32cap_P declarations
@@ -156,6 +160,9 @@ sub ::public_label
sub ::data_byte
{ push(@out,("DB\t").join(',',@_)."\n"); }
+sub ::data_short
+{ push(@out,("DW\t").join(',',@_)."\n"); }
+
sub ::data_word
{ push(@out,("DD\t").join(',',@_)."\n"); }
@@ -181,4 +188,11 @@ ___
sub ::dataseg
{ push(@out,"$segment\tENDS\n_DATA\tSEGMENT\n"); $segment="_DATA"; }
+sub ::safeseh
+{ my $nm=shift;
+ push(@out,"IF \@Version GE 710\n");
+ push(@out,".SAFESEH ".&::LABEL($nm,$nmdecor.$nm)."\n");
+ push(@out,"ENDIF\n");
+}
+
1;
diff --git a/main/openssl/crypto/perlasm/x86nasm.pl b/main/openssl/crypto/perlasm/x86nasm.pl
index ce2bed9b..ca2511c9 100644
--- a/main/openssl/crypto/perlasm/x86nasm.pl
+++ b/main/openssl/crypto/perlasm/x86nasm.pl
@@ -19,6 +19,8 @@ sub ::generic
{ $_[0] = "NEAR $_[0]"; }
elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea
{ $_[1] =~ s/^[^\[]*\[/\[/o; }
+ elsif ($opcode eq "clflush" && $#_==0)
+ { $_[0] =~ s/^[^\[]*\[/\[/o; }
}
&::emit($opcode,@_);
1;
@@ -67,6 +69,7 @@ sub get_mem
}
sub ::BP { &get_mem("BYTE",@_); }
sub ::DWP { &get_mem("DWORD",@_); }
+sub ::WP { &get_mem("WORD",@_); }
sub ::QWP { &get_mem("",@_); }
sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; }
sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; }
@@ -114,7 +117,7 @@ sub ::file_end
{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
${drdecor}segment .bss
-${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 4
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8
___
# comment out OPENSSL_ia32cap_P declarations
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
@@ -135,7 +138,8 @@ sub ::public_label
sub ::data_byte
{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); }
-
+sub ::data_short
+{ push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); }
sub ::data_word
{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); }
@@ -163,4 +167,11 @@ sub ::dataseg
else { push(@out,"section\t.data align=4\n"); }
}
+sub ::safeseh
+{ my $nm=shift;
+ push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n");
+ push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n");
+ push(@out,"%endif\n");
+}
+
1;
diff --git a/main/openssl/crypto/pkcs12/p12_crt.c b/main/openssl/crypto/pkcs12/p12_crt.c
index 96b131de..a34915d0 100644
--- a/main/openssl/crypto/pkcs12/p12_crt.c
+++ b/main/openssl/crypto/pkcs12/p12_crt.c
@@ -90,7 +90,14 @@ PKCS12 *PKCS12_create(char *pass, char *name, EVP_PKEY *pkey, X509 *cert,
/* Set defaults */
if (!nid_cert)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ nid_cert = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
+ else
+#endif
nid_cert = NID_pbe_WithSHA1And40BitRC2_CBC;
+ }
if (!nid_key)
nid_key = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
if (!iter)
diff --git a/main/openssl/crypto/pkcs12/p12_decr.c b/main/openssl/crypto/pkcs12/p12_decr.c
index ba77dbbe..9d3557e8 100644
--- a/main/openssl/crypto/pkcs12/p12_decr.c
+++ b/main/openssl/crypto/pkcs12/p12_decr.c
@@ -89,7 +89,14 @@ unsigned char * PKCS12_pbe_crypt(X509_ALGOR *algor, const char *pass,
goto err;
}
- EVP_CipherUpdate(&ctx, out, &i, in, inlen);
+ if (!EVP_CipherUpdate(&ctx, out, &i, in, inlen))
+ {
+ OPENSSL_free(out);
+ out = NULL;
+ PKCS12err(PKCS12_F_PKCS12_PBE_CRYPT,ERR_R_EVP_LIB);
+ goto err;
+ }
+
outlen = i;
if(!EVP_CipherFinal_ex(&ctx, out + i, &i)) {
OPENSSL_free(out);
diff --git a/main/openssl/crypto/pkcs12/p12_key.c b/main/openssl/crypto/pkcs12/p12_key.c
index 424203f6..61d58502 100644
--- a/main/openssl/crypto/pkcs12/p12_key.c
+++ b/main/openssl/crypto/pkcs12/p12_key.c
@@ -152,14 +152,16 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt,
for (i = 0; i < Slen; i++) *p++ = salt[i % saltlen];
for (i = 0; i < Plen; i++) *p++ = pass[i % passlen];
for (;;) {
- EVP_DigestInit_ex(&ctx, md_type, NULL);
- EVP_DigestUpdate(&ctx, D, v);
- EVP_DigestUpdate(&ctx, I, Ilen);
- EVP_DigestFinal_ex(&ctx, Ai, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+ || !EVP_DigestUpdate(&ctx, D, v)
+ || !EVP_DigestUpdate(&ctx, I, Ilen)
+ || !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+ goto err;
for (j = 1; j < iter; j++) {
- EVP_DigestInit_ex(&ctx, md_type, NULL);
- EVP_DigestUpdate(&ctx, Ai, u);
- EVP_DigestFinal_ex(&ctx, Ai, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+ || !EVP_DigestUpdate(&ctx, Ai, u)
+ || !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+ goto err;
}
memcpy (out, Ai, min (n, u));
if (u >= n) {
@@ -174,24 +176,32 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt,
out += u;
for (j = 0; j < v; j++) B[j] = Ai[j % u];
/* Work out B + 1 first then can use B as tmp space */
- if (!BN_bin2bn (B, v, Bpl1)) goto err;
- if (!BN_add_word (Bpl1, 1)) goto err;
+ if (!BN_bin2bn (B, v, Bpl1))
+ goto err;
+ if (!BN_add_word (Bpl1, 1))
+ goto err;
for (j = 0; j < Ilen ; j+=v) {
- if (!BN_bin2bn (I + j, v, Ij)) goto err;
- if (!BN_add (Ij, Ij, Bpl1)) goto err;
- BN_bn2bin (Ij, B);
+ if (!BN_bin2bn(I + j, v, Ij))
+ goto err;
+ if (!BN_add(Ij, Ij, Bpl1))
+ goto err;
+ if (!BN_bn2bin(Ij, B))
+ goto err;
Ijlen = BN_num_bytes (Ij);
/* If more than 2^(v*8) - 1 cut off MSB */
if (Ijlen > v) {
- BN_bn2bin (Ij, B);
+ if (!BN_bn2bin (Ij, B))
+ goto err;
memcpy (I + j, B + 1, v);
#ifndef PKCS12_BROKEN_KEYGEN
/* If less than v bytes pad with zeroes */
} else if (Ijlen < v) {
memset(I + j, 0, v - Ijlen);
- BN_bn2bin(Ij, I + j + v - Ijlen);
+ if (!BN_bn2bin(Ij, I + j + v - Ijlen))
+ goto err;
#endif
- } else BN_bn2bin (Ij, I + j);
+ } else if (!BN_bn2bin (Ij, I + j))
+ goto err;
}
}
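The p12_key.c hunk adds return-value checks around the PKCS#12 KDF step I_j = (I_j + B + 1) mod 2^(v*8). That step in isolation, with illustrative names (tmp must hold at least v+1 bytes, since the sum can carry one byte past v; this mirrors the non-PKCS12_BROKEN_KEYGEN branch):

    #include <string.h>
    #include <openssl/bn.h>

    static int add_b_plus_1(unsigned char *Ij_bytes, int v, const BIGNUM *Bpl1,
                            BIGNUM *Ij, unsigned char *tmp)
    {
        int len;

        if (!BN_bin2bn(Ij_bytes, v, Ij) || !BN_add(Ij, Ij, Bpl1))
            return 0;
        len = BN_num_bytes(Ij);
        if (len > v) {                         /* carry: cut off the MSB */
            if (!BN_bn2bin(Ij, tmp))
                return 0;
            memcpy(Ij_bytes, tmp + 1, v);
        } else {                               /* short: zero-pad on the left */
            memset(Ij_bytes, 0, v - len);
            if (!BN_bn2bin(Ij, Ij_bytes + v - len))
                return 0;
        }
        return 1;
    }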
diff --git a/main/openssl/crypto/pkcs12/p12_kiss.c b/main/openssl/crypto/pkcs12/p12_kiss.c
index 292cc3ed..206b1b0b 100644
--- a/main/openssl/crypto/pkcs12/p12_kiss.c
+++ b/main/openssl/crypto/pkcs12/p12_kiss.c
@@ -167,7 +167,7 @@ int PKCS12_parse(PKCS12 *p12, const char *pass, EVP_PKEY **pkey, X509 **cert,
if (cert && *cert)
X509_free(*cert);
if (x)
- X509_free(*cert);
+ X509_free(x);
if (ocerts)
sk_X509_pop_free(ocerts, X509_free);
return 0;
diff --git a/main/openssl/crypto/pkcs12/p12_mutl.c b/main/openssl/crypto/pkcs12/p12_mutl.c
index 9ab740d5..96de1bd1 100644
--- a/main/openssl/crypto/pkcs12/p12_mutl.c
+++ b/main/openssl/crypto/pkcs12/p12_mutl.c
@@ -97,10 +97,14 @@ int PKCS12_gen_mac(PKCS12 *p12, const char *pass, int passlen,
return 0;
}
HMAC_CTX_init(&hmac);
- HMAC_Init_ex(&hmac, key, md_size, md_type, NULL);
- HMAC_Update(&hmac, p12->authsafes->d.data->data,
- p12->authsafes->d.data->length);
- HMAC_Final(&hmac, mac, maclen);
+ if (!HMAC_Init_ex(&hmac, key, md_size, md_type, NULL)
+ || !HMAC_Update(&hmac, p12->authsafes->d.data->data,
+ p12->authsafes->d.data->length)
+ || !HMAC_Final(&hmac, mac, maclen))
+ {
+ HMAC_CTX_cleanup(&hmac);
+ return 0;
+ }
HMAC_CTX_cleanup(&hmac);
return 1;
}
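The same checked-call pattern, applied to HMAC in the patched PKCS12_gen_mac, as a stand-alone sketch (OpenSSL 1.0.x stack-allocated HMAC_CTX):

    #include <openssl/hmac.h>

    static int mac_checked(const EVP_MD *md, const void *key, int keylen,
                           const unsigned char *data, size_t datalen,
                           unsigned char *mac, unsigned int *maclen)
    {
        HMAC_CTX hmac;
        int ok;

        HMAC_CTX_init(&hmac);
        ok = HMAC_Init_ex(&hmac, key, keylen, md, NULL)
             && HMAC_Update(&hmac, data, datalen)
             && HMAC_Final(&hmac, mac, maclen);
        HMAC_CTX_cleanup(&hmac);               /* both paths, exactly once */
        return ok;
    }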
diff --git a/main/openssl/crypto/pkcs7/pk7_doit.c b/main/openssl/crypto/pkcs7/pk7_doit.c
index 3bf1a367..77fda3b8 100644
--- a/main/openssl/crypto/pkcs7/pk7_doit.c
+++ b/main/openssl/crypto/pkcs7/pk7_doit.c
@@ -204,11 +204,11 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
unsigned char *ek = NULL;
size_t eklen;
- int ret = 0;
+ int ret = -1;
pctx = EVP_PKEY_CTX_new(pkey, NULL);
if (!pctx)
- return 0;
+ return -1;
if (EVP_PKEY_decrypt_init(pctx) <= 0)
goto err;
@@ -235,12 +235,19 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
if (EVP_PKEY_decrypt(pctx, ek, &eklen,
ri->enc_key->data, ri->enc_key->length) <= 0)
{
+ ret = 0;
PKCS7err(PKCS7_F_PKCS7_DECRYPT_RINFO, ERR_R_EVP_LIB);
goto err;
}
ret = 1;
+ if (*pek)
+ {
+ OPENSSL_cleanse(*pek, *peklen);
+ OPENSSL_free(*pek);
+ }
+
*pek = ek;
*peklen = eklen;
@@ -423,6 +430,8 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
STACK_OF(X509_ALGOR) *md_sk=NULL;
STACK_OF(PKCS7_RECIP_INFO) *rsk=NULL;
PKCS7_RECIP_INFO *ri=NULL;
+ unsigned char *ek = NULL, *tkey = NULL;
+ int eklen = 0, tkeylen = 0;
i=OBJ_obj2nid(p7->type);
p7->state=PKCS7_S_HEADER;
@@ -500,8 +509,6 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
int max;
X509_OBJECT ret;
#endif
- unsigned char *ek = NULL;
- int eklen;
if ((etmp=BIO_new(BIO_f_cipher())) == NULL)
{
@@ -534,29 +541,28 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
}
/* If we haven't got a certificate try each ri in turn */
-
if (pcert == NULL)
{
+ /* Always attempt to decrypt all rinfo even
+	 * after success as a defence against MMA timing
+ * attacks.
+ */
for (i=0; i<sk_PKCS7_RECIP_INFO_num(rsk); i++)
{
ri=sk_PKCS7_RECIP_INFO_value(rsk,i);
+
if (pkcs7_decrypt_rinfo(&ek, &eklen,
- ri, pkey) > 0)
- break;
+ ri, pkey) < 0)
+ goto err;
ERR_clear_error();
- ri = NULL;
- }
- if (ri == NULL)
- {
- PKCS7err(PKCS7_F_PKCS7_DATADECODE,
- PKCS7_R_NO_RECIPIENT_MATCHES_KEY);
- goto err;
}
}
else
{
- if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) <= 0)
+ /* Only exit on fatal errors, not decrypt failure */
+ if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0)
goto err;
+ ERR_clear_error();
}
evp_ctx=NULL;
@@ -565,6 +571,19 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
goto err;
if (EVP_CIPHER_asn1_to_param(evp_ctx,enc_alg->parameter) < 0)
goto err;
+ /* Generate random key as MMA defence */
+ tkeylen = EVP_CIPHER_CTX_key_length(evp_ctx);
+ tkey = OPENSSL_malloc(tkeylen);
+ if (!tkey)
+ goto err;
+ if (EVP_CIPHER_CTX_rand_key(evp_ctx, tkey) <= 0)
+ goto err;
+ if (ek == NULL)
+ {
+ ek = tkey;
+ eklen = tkeylen;
+ tkey = NULL;
+ }
if (eklen != EVP_CIPHER_CTX_key_length(evp_ctx)) {
/* Some S/MIME clients don't use the same key
@@ -573,11 +592,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
*/
if(!EVP_CIPHER_CTX_set_key_length(evp_ctx, eklen))
{
- PKCS7err(PKCS7_F_PKCS7_DATADECODE,
- PKCS7_R_DECRYPTED_KEY_IS_WRONG_LENGTH);
- goto err;
+ /* Use random key as MMA defence */
+ OPENSSL_cleanse(ek, eklen);
+ OPENSSL_free(ek);
+ ek = tkey;
+ eklen = tkeylen;
+ tkey = NULL;
}
}
+ /* Clear errors so we don't leak information useful in MMA */
+ ERR_clear_error();
if (EVP_CipherInit_ex(evp_ctx,NULL,NULL,ek,NULL,0) <= 0)
goto err;
@@ -585,6 +609,13 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
{
OPENSSL_cleanse(ek,eklen);
OPENSSL_free(ek);
+ ek = NULL;
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey,tkeylen);
+ OPENSSL_free(tkey);
+ tkey = NULL;
}
if (out == NULL)
@@ -627,6 +658,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
if (0)
{
err:
+ if (ek)
+ {
+ OPENSSL_cleanse(ek,eklen);
+ OPENSSL_free(ek);
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey,tkeylen);
+ OPENSSL_free(tkey);
+ }
if (out != NULL) BIO_free_all(out);
if (btmp != NULL) BIO_free_all(btmp);
if (etmp != NULL) BIO_free_all(etmp);
@@ -676,7 +717,11 @@ static int do_pkcs7_signed_attrib(PKCS7_SIGNER_INFO *si, EVP_MD_CTX *mctx)
}
/* Add digest */
- EVP_DigestFinal_ex(mctx, md_data,&md_len);
+ if (!EVP_DigestFinal_ex(mctx, md_data,&md_len))
+ {
+ PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_EVP_LIB);
+ return 0;
+ }
if (!PKCS7_add1_attrib_digest(si, md_data, md_len))
{
PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_MALLOC_FAILURE);
@@ -784,7 +829,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
/* We now have the EVP_MD_CTX, lets do the
* signing. */
- EVP_MD_CTX_copy_ex(&ctx_tmp,mdc);
+ if (!EVP_MD_CTX_copy_ex(&ctx_tmp,mdc))
+ goto err;
sk=si->auth_attr;
@@ -822,7 +868,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
if (!PKCS7_find_digest(&mdc, bio,
OBJ_obj2nid(p7->d.digest->md->algorithm)))
goto err;
- EVP_DigestFinal_ex(mdc,md_data,&md_len);
+ if (!EVP_DigestFinal_ex(mdc,md_data,&md_len))
+ goto err;
M_ASN1_OCTET_STRING_set(p7->d.digest->digest, md_data, md_len);
}
@@ -1015,7 +1062,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
/* mdc is the digest ctx that we want, unless there are attributes,
* in which case the digest is the signed attributes */
- EVP_MD_CTX_copy_ex(&mdc_tmp,mdc);
+ if (!EVP_MD_CTX_copy_ex(&mdc_tmp,mdc))
+ goto err;
sk=si->auth_attr;
if ((sk != NULL) && (sk_X509_ATTRIBUTE_num(sk) != 0))
@@ -1025,7 +1073,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
int alen;
ASN1_OCTET_STRING *message_digest;
- EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len);
+ if (!EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len))
+ goto err;
message_digest=PKCS7_digest_from_attributes(sk);
if (!message_digest)
{
@@ -1050,7 +1099,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
goto err;
}
- EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL);
+ if (!EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL))
+ goto err;
alen = ASN1_item_i2d((ASN1_VALUE *)sk, &abuf,
ASN1_ITEM_rptr(PKCS7_ATTR_VERIFY));
@@ -1060,7 +1110,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
ret = -1;
goto err;
}
- EVP_VerifyUpdate(&mdc_tmp, abuf, alen);
+ if (!EVP_VerifyUpdate(&mdc_tmp, abuf, alen))
+ goto err;
OPENSSL_free(abuf);
}
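The pk7_doit.c changes implement the standard MMA (Bleichenbacher-style) countermeasure: attempt every recipient info regardless of success, then fall back to a freshly generated random content-encryption key so a padding failure is indistinguishable from success at this stage. The fallback in isolation, with hypothetical names:

    #include <openssl/crypto.h>
    #include <openssl/evp.h>

    static int pick_content_key(EVP_CIPHER_CTX *ctx, unsigned char **ek,
                                int *eklen)
    {
        int klen = EVP_CIPHER_CTX_key_length(ctx);
        unsigned char *tkey = OPENSSL_malloc(klen);

        if (tkey == NULL)
            return 0;
        if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) {
            OPENSSL_free(tkey);
            return 0;
        }
        if (*ek == NULL) {         /* every recipient failed: random key */
            *ek = tkey;
            *eklen = klen;
        } else {                   /* real key recovered: discard fallback */
            OPENSSL_cleanse(tkey, klen);
            OPENSSL_free(tkey);
        }
        return 1;
    }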
diff --git a/main/openssl/crypto/pkcs7/pk7_smime.c b/main/openssl/crypto/pkcs7/pk7_smime.c
index 86742d0d..a5104f8d 100644
--- a/main/openssl/crypto/pkcs7/pk7_smime.c
+++ b/main/openssl/crypto/pkcs7/pk7_smime.c
@@ -573,15 +573,34 @@ int PKCS7_decrypt(PKCS7 *p7, EVP_PKEY *pkey, X509 *cert, BIO *data, int flags)
return 0;
}
ret = SMIME_text(bread, data);
+ if (ret > 0 && BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(tmpmem))
+ ret = 0;
+ }
BIO_free_all(bread);
return ret;
} else {
for(;;) {
i = BIO_read(tmpmem, buf, sizeof(buf));
- if(i <= 0) break;
- BIO_write(data, buf, i);
+ if(i <= 0)
+ {
+ ret = 1;
+ if (BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(tmpmem))
+ ret = 0;
+ }
+
+ break;
+ }
+ if (BIO_write(data, buf, i) != i)
+ {
+ ret = 0;
+ break;
+ }
}
BIO_free_all(tmpmem);
- return 1;
+ return ret;
}
}
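The PKCS7_decrypt change above closes a silent-truncation hole: a cipher BIO read to EOF may still have failed its final-block check, which only BIO_get_cipher_status() reveals. The drain-and-verify loop as a sketch:

    #include <openssl/bio.h>
    #include <openssl/evp.h>       /* BIO_get_cipher_status() */

    static int drain_checked(BIO *in, BIO *out)
    {
        unsigned char buf[4096];
        int i;

        for (;;) {
            i = BIO_read(in, buf, sizeof(buf));
            if (i <= 0)
                break;
            if (BIO_write(out, buf, i) != i)
                return 0;
        }
        if (BIO_method_type(in) == BIO_TYPE_CIPHER
            && !BIO_get_cipher_status(in))
            return 0;              /* bad key or padding */
        return 1;
    }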
diff --git a/main/openssl/crypto/ppccpuid.pl b/main/openssl/crypto/ppccpuid.pl
index 369e1d0d..4ba736a1 100755
--- a/main/openssl/crypto/ppccpuid.pl
+++ b/main/openssl/crypto/ppccpuid.pl
@@ -23,36 +23,67 @@ $code=<<___;
.machine "any"
.text
-.globl .OPENSSL_cpuid_setup
+.globl .OPENSSL_ppc64_probe
.align 4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+ fcfid f1,f1
+ extrdi r0,r0,32,0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .OPENSSL_altivec_probe
+.align 4
+.OPENSSL_altivec_probe:
+ .long 0x10000484 # vor v0,v0,v0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu:
xor r0,r0,r0
+ fmr f0,f31
+ fmr f1,f31
+ fmr f2,f31
mr r3,r1
+ fmr f3,f31
xor r4,r4,r4
+ fmr f4,f31
xor r5,r5,r5
+ fmr f5,f31
xor r6,r6,r6
+ fmr f6,f31
xor r7,r7,r7
+ fmr f7,f31
xor r8,r8,r8
+ fmr f8,f31
xor r9,r9,r9
+ fmr f9,f31
xor r10,r10,r10
+ fmr f10,f31
xor r11,r11,r11
+ fmr f11,f31
xor r12,r12,r12
+ fmr f12,f31
+ fmr f13,f31
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_atomic_add
.align 4
.OPENSSL_atomic_add:
-Loop: lwarx r5,0,r3
+Ladd: lwarx r5,0,r3
add r0,r4,r5
stwcx. r0,0,r3
- bne- Loop
+ bne- Ladd
$SIGNX r3,r0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
.globl .OPENSSL_rdtsc
.align 4
@@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3
mftb r3
mftbu r4
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_cleanse
.align 4
@@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3
Little: mtctr r4
stb r0,0(r3)
addi r3,r3,1
- bdnz- \$-8
+ bdnz \$-8
blr
Lot: andi. r5,r3,3
beq Laligned
@@ -85,10 +118,13 @@ Laligned:
mtctr r5
stw r0,0(r3)
addi r3,r3,4
- bdnz- \$-8
+ bdnz \$-8
andi. r4,r4,3
bne Little
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/main/openssl/crypto/rand/md_rand.c b/main/openssl/crypto/rand/md_rand.c
index b2f04ff1..dd291637 100644
--- a/main/openssl/crypto/rand/md_rand.c
+++ b/main/openssl/crypto/rand/md_rand.c
@@ -109,6 +109,8 @@
*
*/
+#define OPENSSL_FIPSEVP
+
#ifdef MD_RAND_DEBUG
# ifndef NDEBUG
# define NDEBUG
@@ -121,10 +123,10 @@
#include "e_os.h"
+#include <openssl/crypto.h>
#include <openssl/rand.h>
#include "rand_lcl.h"
-#include <openssl/crypto.h>
#include <openssl/err.h>
#ifdef BN_DEBUG
@@ -157,13 +159,14 @@ const char RAND_version[]="RAND" OPENSSL_VERSION_PTEXT;
static void ssleay_rand_cleanup(void);
static void ssleay_rand_seed(const void *buf, int num);
static void ssleay_rand_add(const void *buf, int num, double add_entropy);
-static int ssleay_rand_bytes(unsigned char *buf, int num);
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo);
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num);
static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num);
static int ssleay_rand_status(void);
RAND_METHOD rand_ssleay_meth={
ssleay_rand_seed,
- ssleay_rand_bytes,
+ ssleay_rand_nopseudo_bytes,
ssleay_rand_cleanup,
ssleay_rand_add,
ssleay_rand_pseudo_bytes,
@@ -328,7 +331,7 @@ static void ssleay_rand_seed(const void *buf, int num)
ssleay_rand_add(buf, num, (double)num);
}
-static int ssleay_rand_bytes(unsigned char *buf, int num)
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo)
{
static volatile int stirred_pool = 0;
int i,j,k,st_num,st_idx;
@@ -377,8 +380,11 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
* are fed into the hash function and the results are kept in the
* global 'md'.
*/
-
- CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ /* NB: in FIPS mode we are already under a lock */
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_lock(CRYPTO_LOCK_RAND);
/* prevent ssleay_rand_bytes() from trying to obtain the lock again */
CRYPTO_w_lock(CRYPTO_LOCK_RAND2);
@@ -457,7 +463,10 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
/* before unlocking, we must clear 'crypto_lock_rand' */
crypto_lock_rand = 0;
- CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
while (num > 0)
{
@@ -509,15 +518,23 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
MD_Init(&m);
MD_Update(&m,(unsigned char *)&(md_c[0]),sizeof(md_c));
MD_Update(&m,local_md,MD_DIGEST_LENGTH);
- CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_lock(CRYPTO_LOCK_RAND);
MD_Update(&m,md,MD_DIGEST_LENGTH);
MD_Final(&m,md);
- CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
EVP_MD_CTX_cleanup(&m);
if (ok)
return(1);
- else
+ else if (pseudo)
+ return 0;
+ else
{
RANDerr(RAND_F_SSLEAY_RAND_BYTES,RAND_R_PRNG_NOT_SEEDED);
ERR_add_error_data(1, "You need to read the OpenSSL FAQ, "
@@ -526,22 +543,16 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
}
}
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num)
+ {
+ return ssleay_rand_bytes(buf, num, 0);
+ }
+
/* pseudo-random bytes that are guaranteed to be unique but not
unpredictable */
static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num)
{
- int ret;
- unsigned long err;
-
- ret = RAND_bytes(buf, num);
- if (ret == 0)
- {
- err = ERR_peek_error();
- if (ERR_GET_LIB(err) == ERR_LIB_RAND &&
- ERR_GET_REASON(err) == RAND_R_PRNG_NOT_SEEDED)
- ERR_clear_error();
- }
- return (ret);
+ return ssleay_rand_bytes(buf, num, 1);
}
static int ssleay_rand_status(void)
diff --git a/main/openssl/crypto/rand/rand.h b/main/openssl/crypto/rand/rand.h
index ac6c0217..bb5520e8 100644
--- a/main/openssl/crypto/rand/rand.h
+++ b/main/openssl/crypto/rand/rand.h
@@ -119,6 +119,11 @@ int RAND_event(UINT, WPARAM, LPARAM);
#endif
+#ifdef OPENSSL_FIPS
+void RAND_set_fips_drbg_type(int type, int flags);
+int RAND_init_fips(void);
+#endif
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -129,9 +134,14 @@ void ERR_load_RAND_strings(void);
/* Function codes. */
#define RAND_F_RAND_GET_RAND_METHOD 101
+#define RAND_F_RAND_INIT_FIPS 102
#define RAND_F_SSLEAY_RAND_BYTES 100
/* Reason codes. */
+#define RAND_R_DUAL_EC_DRBG_DISABLED 104
+#define RAND_R_ERROR_INITIALISING_DRBG 102
+#define RAND_R_ERROR_INSTANTIATING_DRBG 103
+#define RAND_R_NO_FIPS_RANDOM_METHOD_SET 101
#define RAND_R_PRNG_NOT_SEEDED 100
#ifdef __cplusplus
diff --git a/main/openssl/crypto/rand/rand_err.c b/main/openssl/crypto/rand/rand_err.c
index 03cda4dd..c4c80fc8 100644
--- a/main/openssl/crypto/rand/rand_err.c
+++ b/main/openssl/crypto/rand/rand_err.c
@@ -1,6 +1,6 @@
/* crypto/rand/rand_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -71,12 +71,17 @@
static ERR_STRING_DATA RAND_str_functs[]=
{
{ERR_FUNC(RAND_F_RAND_GET_RAND_METHOD), "RAND_get_rand_method"},
+{ERR_FUNC(RAND_F_RAND_INIT_FIPS), "RAND_init_fips"},
{ERR_FUNC(RAND_F_SSLEAY_RAND_BYTES), "SSLEAY_RAND_BYTES"},
{0,NULL}
};
static ERR_STRING_DATA RAND_str_reasons[]=
{
+{ERR_REASON(RAND_R_DUAL_EC_DRBG_DISABLED),"dual ec drbg disabled"},
+{ERR_REASON(RAND_R_ERROR_INITIALISING_DRBG),"error initialising drbg"},
+{ERR_REASON(RAND_R_ERROR_INSTANTIATING_DRBG),"error instantiating drbg"},
+{ERR_REASON(RAND_R_NO_FIPS_RANDOM_METHOD_SET),"no fips random method set"},
{ERR_REASON(RAND_R_PRNG_NOT_SEEDED) ,"PRNG not seeded"},
{0,NULL}
};
diff --git a/main/openssl/crypto/rand/rand_lib.c b/main/openssl/crypto/rand/rand_lib.c
index 513e3389..5ac0e14c 100644
--- a/main/openssl/crypto/rand/rand_lib.c
+++ b/main/openssl/crypto/rand/rand_lib.c
@@ -60,10 +60,16 @@
#include <time.h>
#include "cryptlib.h"
#include <openssl/rand.h>
+
#ifndef OPENSSL_NO_ENGINE
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/fips_rand.h>
+#endif
+
#ifndef OPENSSL_NO_ENGINE
/* non-NULL if default_RAND_meth is ENGINE-provided */
static ENGINE *funct_ref =NULL;
@@ -174,3 +180,127 @@ int RAND_status(void)
return meth->status();
return 0;
}
+
+#ifdef OPENSSL_FIPS
+
+/* FIPS DRBG initialisation code. This sets up the DRBG for use by the
+ * rest of OpenSSL.
+ */
+
+/* Entropy gatherer: use standard OpenSSL PRNG to seed (this will gather
+ * entropy internally through RAND_poll().
+ */
+
+static size_t drbg_get_entropy(DRBG_CTX *ctx, unsigned char **pout,
+ int entropy, size_t min_len, size_t max_len)
+ {
+ /* Round up request to multiple of block size */
+ min_len = ((min_len + 19) / 20) * 20;
+ *pout = OPENSSL_malloc(min_len);
+ if (!*pout)
+ return 0;
+ if (RAND_SSLeay()->bytes(*pout, min_len) <= 0)
+ {
+ OPENSSL_free(*pout);
+ *pout = NULL;
+ return 0;
+ }
+ return min_len;
+ }
+
+static void drbg_free_entropy(DRBG_CTX *ctx, unsigned char *out, size_t olen)
+ {
+ if (out)
+ {
+ OPENSSL_cleanse(out, olen);
+ OPENSSL_free(out);
+ }
+ }
+
+/* Set "additional input" when generating random data. This uses the
+ * current PID, a time value and a counter.
+ */
+
+static size_t drbg_get_adin(DRBG_CTX *ctx, unsigned char **pout)
+ {
+ /* Use of static variables is OK as this happens under a lock */
+ static unsigned char buf[16];
+ static unsigned long counter;
+ FIPS_get_timevec(buf, &counter);
+ *pout = buf;
+ return sizeof(buf);
+ }
+
+/* RAND_add() and RAND_seed() pass through to OpenSSL PRNG so it is
+ * correctly seeded by RAND_poll().
+ */
+
+static int drbg_rand_add(DRBG_CTX *ctx, const void *in, int inlen,
+ double entropy)
+ {
+ RAND_SSLeay()->add(in, inlen, entropy);
+ return 1;
+ }
+
+static int drbg_rand_seed(DRBG_CTX *ctx, const void *in, int inlen)
+ {
+ RAND_SSLeay()->seed(in, inlen);
+ return 1;
+ }
+
+#ifndef OPENSSL_DRBG_DEFAULT_TYPE
+#define OPENSSL_DRBG_DEFAULT_TYPE NID_aes_256_ctr
+#endif
+#ifndef OPENSSL_DRBG_DEFAULT_FLAGS
+#define OPENSSL_DRBG_DEFAULT_FLAGS DRBG_FLAG_CTR_USE_DF
+#endif
+
+static int fips_drbg_type = OPENSSL_DRBG_DEFAULT_TYPE;
+static int fips_drbg_flags = OPENSSL_DRBG_DEFAULT_FLAGS;
+
+void RAND_set_fips_drbg_type(int type, int flags)
+ {
+ fips_drbg_type = type;
+ fips_drbg_flags = flags;
+ }
+
+int RAND_init_fips(void)
+ {
+ DRBG_CTX *dctx;
+ size_t plen;
+ unsigned char pers[32], *p;
+#ifndef OPENSSL_ALLOW_DUAL_EC_DRBG
+ if (fips_drbg_type >> 16)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_DUAL_EC_DRBG_DISABLED);
+ return 0;
+ }
+#endif
+
+ dctx = FIPS_get_default_drbg();
+ if (FIPS_drbg_init(dctx, fips_drbg_type, fips_drbg_flags) <= 0)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INITIALISING_DRBG);
+ return 0;
+ }
+
+ FIPS_drbg_set_callbacks(dctx,
+ drbg_get_entropy, drbg_free_entropy, 20,
+ drbg_get_entropy, drbg_free_entropy);
+ FIPS_drbg_set_rand_callbacks(dctx, drbg_get_adin, 0,
+ drbg_rand_seed, drbg_rand_add);
+ /* Personalisation string: a string followed by date time vector */
+ strcpy((char *)pers, "OpenSSL DRBG2.0");
+ plen = drbg_get_adin(dctx, &p);
+ memcpy(pers + 16, p, plen);
+
+ if (FIPS_drbg_instantiate(dctx, pers, sizeof(pers)) <= 0)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INSTANTIATING_DRBG);
+ return 0;
+ }
+ FIPS_rand_set_method(FIPS_drbg_method());
+ return 1;
+ }
+
+#endif
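A hypothetical call sequence for the FIPS DRBG interface declared in rand.h above. The type and flags shown are the built-in defaults from rand_lib.c; DRBG_FLAG_CTR_USE_DF is assumed to come from fips_rand.h:

    #include <openssl/crypto.h>        /* FIPS_mode() */
    #include <openssl/rand.h>
    #ifdef OPENSSL_FIPS
    # include <openssl/objects.h>      /* NID_aes_256_ctr */
    # include <openssl/fips_rand.h>    /* DRBG_FLAG_CTR_USE_DF */
    #endif

    static int setup_rng(void)
    {
    #ifdef OPENSSL_FIPS
        if (FIPS_mode()) {
            /* must be called before the DRBG is instantiated */
            RAND_set_fips_drbg_type(NID_aes_256_ctr, DRBG_FLAG_CTR_USE_DF);
            if (!RAND_init_fips())
                return 0;
        }
    #endif
        return RAND_poll();
    }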
diff --git a/main/openssl/crypto/rand/rand_unix.c b/main/openssl/crypto/rand/rand_unix.c
index e9ead3a5..e3a65571 100644
--- a/main/openssl/crypto/rand/rand_unix.c
+++ b/main/openssl/crypto/rand/rand_unix.c
@@ -133,47 +133,87 @@
# define FD_SETSIZE (8*sizeof(fd_set))
#endif
-#ifdef __VOS__
+#if defined(OPENSSL_SYS_VOS)
+
+/* The following algorithm repeatedly samples the real-time clock
+ (RTC) to generate a sequence of unpredictable data. The algorithm
+ relies upon the uneven execution speed of the code (due to factors
+ such as cache misses, interrupts, bus activity, and scheduling) and
+ upon the rather large relative difference between the speed of the
+ clock and the rate at which it can be read.
+
+ If this code is ported to an environment where execution speed is
+ more constant or where the RTC ticks at a much slower rate, or the
+ clock can be read with fewer instructions, it is likely that the
+ results would be far more predictable.
+
+ As a precaution, we generate 4 times the minimum required amount of
+ seed data. */
+
int RAND_poll(void)
{
- unsigned char buf[ENTROPY_NEEDED];
+ short int code;
+ gid_t curr_gid;
pid_t curr_pid;
uid_t curr_uid;
- static int first=1;
- int i;
- long rnd = 0;
+ int i, k;
struct timespec ts;
- unsigned seed;
-
-/* The VOS random() function starts from a static seed so its
- initial value is predictable. If random() returns the
- initial value, reseed it with dynamic data. The VOS
- real-time clock has a granularity of 1 nsec so it should be
- reasonably difficult to predict its exact value. Do not
- gratuitously reseed the PRNG because other code in this
- process or thread may be using it. */
-
- if (first) {
- first = 0;
- rnd = random ();
- if (rnd == 1804289383) {
- clock_gettime (CLOCK_REALTIME, &ts);
- curr_pid = getpid();
- curr_uid = getuid();
- seed = ts.tv_sec ^ ts.tv_nsec ^ curr_pid ^ curr_uid;
- srandom (seed);
- }
- }
+ unsigned char v;
- for (i = 0; i < sizeof(buf); i++) {
- if (i % 4 == 0)
- rnd = random();
- buf[i] = rnd;
- rnd >>= 8;
- }
- RAND_add(buf, sizeof(buf), ENTROPY_NEEDED);
- memset(buf, 0, sizeof(buf));
+#ifdef OPENSSL_SYS_VOS_HPPA
+ long duration;
+ extern void s$sleep (long *_duration, short int *_code);
+#else
+#ifdef OPENSSL_SYS_VOS_IA32
+ long long duration;
+ extern void s$sleep2 (long long *_duration, short int *_code);
+#else
+#error "Unsupported Platform."
+#endif /* OPENSSL_SYS_VOS_IA32 */
+#endif /* OPENSSL_SYS_VOS_HPPA */
+
+ /* Seed with the gid, pid, and uid, to ensure *some*
+ variation between different processes. */
+
+ curr_gid = getgid();
+ RAND_add (&curr_gid, sizeof curr_gid, 1);
+ curr_gid = 0;
+
+ curr_pid = getpid();
+ RAND_add (&curr_pid, sizeof curr_pid, 1);
+ curr_pid = 0;
+
+ curr_uid = getuid();
+ RAND_add (&curr_uid, sizeof curr_uid, 1);
+ curr_uid = 0;
+ for (i=0; i<(ENTROPY_NEEDED*4); i++)
+ {
+ /* burn some cpu; hope for interrupts, cache
+ collisions, bus interference, etc. */
+ for (k=0; k<99; k++)
+ ts.tv_nsec = random ();
+
+#ifdef OPENSSL_SYS_VOS_HPPA
+ /* sleep for 1/1024 of a second (976 us). */
+ duration = 1;
+ s$sleep (&duration, &code);
+#else
+#ifdef OPENSSL_SYS_VOS_IA32
+ /* sleep for 1/65536 of a second (15 us). */
+ duration = 1;
+ s$sleep2 (&duration, &code);
+#endif /* OPENSSL_SYS_VOS_IA32 */
+#endif /* OPENSSL_SYS_VOS_HPPA */
+
+ /* get wall clock time. */
+ clock_gettime (CLOCK_REALTIME, &ts);
+
+ /* take 8 bits */
+ v = (unsigned char) (ts.tv_nsec % 256);
+ RAND_add (&v, sizeof v, 1);
+ v = 0;
+ }
return 1;
}
#elif defined __OpenBSD__
diff --git a/main/openssl/crypto/rand/randfile.c b/main/openssl/crypto/rand/randfile.c
index bc7d9c58..7f142807 100644
--- a/main/openssl/crypto/rand/randfile.c
+++ b/main/openssl/crypto/rand/randfile.c
@@ -57,7 +57,9 @@
*/
/* We need to define this to get macros like S_IFBLK and S_IFCHR */
+#if !defined(OPENSSL_SYS_VXWORKS)
#define _XOPEN_SOURCE 500
+#endif
#include <errno.h>
#include <stdio.h>
@@ -137,7 +139,7 @@ int RAND_load_file(const char *file, long bytes)
in=fopen(file,"rb");
#endif
if (in == NULL) goto err;
-#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPNESSL_NO_POSIX_IO)
+#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPENSSL_NO_POSIX_IO)
if (sb.st_mode & (S_IFBLK | S_IFCHR)) {
/* this file is a device. we don't want read an infinite number
* of bytes from a random device, nor do we want to use buffered
diff --git a/main/openssl/crypto/rc2/rc2.h b/main/openssl/crypto/rc2/rc2.h
index 34c83623..e542ec94 100644
--- a/main/openssl/crypto/rc2/rc2.h
+++ b/main/openssl/crypto/rc2/rc2.h
@@ -79,7 +79,9 @@ typedef struct rc2_key_st
RC2_INT data[64];
} RC2_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
+#endif
void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
int enc);
diff --git a/main/openssl/crypto/rc2/rc2_skey.c b/main/openssl/crypto/rc2/rc2_skey.c
index 0150b0e0..6668ac01 100644
--- a/main/openssl/crypto/rc2/rc2_skey.c
+++ b/main/openssl/crypto/rc2/rc2_skey.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/rc2.h>
#include "rc2_locl.h"
@@ -95,6 +96,13 @@ static const unsigned char key_table[256]={
* the same as specifying 1024 for the 'bits' parameter. Bsafe uses
* a version where the bits parameter is the same as len*8 */
void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(RC2);
+ private_RC2_set_key(key, len, data, bits);
+ }
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#endif
{
int i,j;
unsigned char *k;
diff --git a/main/openssl/crypto/rc4/asm/rc4-586.pl b/main/openssl/crypto/rc4/asm/rc4-586.pl
index 38a44a70..5c9ac6ad 100644
--- a/main/openssl/crypto/rc4/asm/rc4-586.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-586.pl
@@ -28,6 +28,34 @@
#
# <appro@fy.chalmers.se>
+# May 2011
+#
+# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
+# performance in cycles per processed byte (less is better) and
+# improvement relative to previous version of this module is:
+#
+# Pentium 10.2 # original numbers
+# Pentium III 7.8(*)
+# Intel P4 7.5
+#
+# Opteron 6.1/+20% # new MMX numbers
+# Core2 5.3/+67%(**)
+# Westmere 5.1/+94%(**)
+# Sandy Bridge 5.0/+8%
+# Atom 12.6/+6%
+#
+# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
+# but this specific code performs poorly on Core2. And vice
+# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
+# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
+# [anymore], I chose to discard PIII-specific code path and opt
+# for original IALU-only code, which is why MMX/SSE code path
+# is guarded by SSE2 bit (see below), not MMX/SSE.
+# (**) Performance vs. block size on Core2 and Westmere had a maximum
+# at ... 64 bytes block size. And it was quite a maximum, 40-60%
+# in comparison to largest 8KB block size. Above improvement
+# coefficients are for the largest block size.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -62,6 +90,68 @@ sub RC4_loop {
&$func ($out,&DWP(0,$dat,$ty,4));
}
+if ($alt=0) {
+ # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
+ # but ~40% slower on Core2 and Westmere... Attempt to add movz
+ # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
+ # on Core2 with movz it's almost 20% slower than below alternative
+ # code... Yes, it's a total mess...
+ my @XX=($xx,$out);
+ $RC4_loop_mmx = sub { # SSE actually...
+ my $i=shift;
+ my $j=$i<=0?0:$i>>1;
+ my $mm=$i<=0?"mm0":"mm".($i&1);
+
+ &add (&LB($yy),&LB($tx));
+ &lea (@XX[1],&DWP(1,@XX[0]));
+ &pxor ("mm2","mm0") if ($i==0);
+ &psllq ("mm1",8) if ($i==0);
+ &and (@XX[1],0xff);
+ &pxor ("mm0","mm0") if ($i<=0);
+ &mov ($ty,&DWP(0,$dat,$yy,4));
+ &mov (&DWP(0,$dat,$yy,4),$tx);
+ &pxor ("mm1","mm2") if ($i==0);
+ &mov (&DWP(0,$dat,$XX[0],4),$ty);
+ &add (&LB($ty),&LB($tx));
+ &movd (@XX[0],"mm7") if ($i==0);
+ &mov ($tx,&DWP(0,$dat,@XX[1],4));
+ &pxor ("mm1","mm1") if ($i==1);
+ &movq ("mm2",&QWP(0,$inp)) if ($i==1);
+ &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
+ &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
+
+ push (@XX,shift(@XX)) if ($i>=0);
+ }
+} else {
+    # Using pinsrw here improves performance on Intel CPUs by 2-3%, but
+ # brings down AMD by 7%...
+ $RC4_loop_mmx = sub {
+ my $i=shift;
+
+ &add (&LB($yy),&LB($tx));
+ &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
+ &mov ($ty,&DWP(0,$dat,$yy,4));
+ &mov (&DWP(0,$dat,$yy,4),$tx);
+ &mov (&DWP(0,$dat,$xx,4),$ty);
+ &inc ($xx);
+ &add ($ty,$tx);
+ &movz ($xx,&LB($xx)); # (*)
+ &movz ($ty,&LB($ty)); # (*)
+ &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
+ &movq ("mm0",&QWP(0,$inp)) if ($i<=0);
+ &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
+ &mov ($tx,&DWP(0,$dat,$xx,4));
+ &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
+
+ # (*) This is the key to Core2 and Westmere performance.
+	#	Without movz, out-of-order execution logic confuses
+ # itself and fails to reorder loads and stores. Problem
+ # appears to be fixed in Sandy Bridge...
+ }
+}
+
+&external_label("OPENSSL_ia32cap_P");
+
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin("RC4");
&mov ($dat,&wparam(0)); # load key schedule pointer
@@ -94,11 +184,56 @@ sub RC4_loop {
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
+ &test ($ty,-8);
+ &mov (&wparam(3),$out); # $out as accumulator in these loops
+ &jz (&label("go4loop4"));
+
+ &picmeup($out,"OPENSSL_ia32cap_P");
+ &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
+ &jnc (&label("go4loop4"));
+
+ &mov ($out,&wparam(3)) if (!$alt);
+ &movd ("mm7",&wparam(3)) if ($alt);
+ &and ($ty,-8);
+ &lea ($ty,&DWP(-8,$inp,$ty));
+ &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
+
+ &$RC4_loop_mmx(-1);
+ &jmp(&label("loop_mmx_enter"));
+
+ &set_label("loop_mmx",16);
+ &$RC4_loop_mmx(0);
+ &set_label("loop_mmx_enter");
+ for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+ &mov ($ty,$yy);
+ &xor ($yy,$yy); # this is second key to Core2
+ &mov (&LB($yy),&LB($ty)); # and Westmere performance...
+ &cmp ($inp,&DWP(-4,$dat));
+ &lea ($inp,&DWP(8,$inp));
+ &jb (&label("loop_mmx"));
+
+ if ($alt) {
+ &movd ($out,"mm7");
+ &pxor ("mm2","mm0");
+ &psllq ("mm1",8);
+ &pxor ("mm1","mm2");
+ &movq (&QWP(-8,$out,$inp),"mm1");
+ } else {
+ &psllq ("mm1",56);
+ &pxor ("mm2","mm1");
+ &movq (&QWP(-8,$out,$inp),"mm2");
+ }
+ &emms ();
+
+ &cmp ($inp,&wparam(1)); # compare to input+len
+ &je (&label("done"));
+ &jmp (&label("loop1"));
+
+&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
&mov (&wparam(2),$ty); # save input+(len/4)*4-4
- &mov (&wparam(3),$out); # $out as accumulator in this loop
- &set_label("loop4",16);
+ &set_label("loop4");
for ($i=0;$i<4;$i++) { RC4_loop($i); }
&ror ($out,8);
&xor ($out,&DWP(0,$inp));
@@ -151,7 +286,7 @@ sub RC4_loop {
&set_label("done");
&dec (&LB($xx));
- &mov (&BP(-4,$dat),&LB($yy)); # save key->y
+ &mov (&DWP(-4,$dat),$yy); # save key->y
&mov (&BP(-8,$dat),&LB($xx)); # save key->x
&set_label("abort");
&function_end("RC4");
@@ -164,10 +299,8 @@ $idi="ebp";
$ido="ecx";
$idx="edx";
-&external_label("OPENSSL_ia32cap_P");
-
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
-&function_begin("RC4_set_key");
+&function_begin("private_RC4_set_key");
&mov ($out,&wparam(0)); # load key
&mov ($idi,&wparam(1)); # load len
&mov ($inp,&wparam(2)); # load data
@@ -245,7 +378,7 @@ $idx="edx";
&xor ("eax","eax");
&mov (&DWP(-8,$out),"eax"); # key->x=0;
&mov (&DWP(-4,$out),"eax"); # key->y=0;
-&function_end("RC4_set_key");
+&function_end("private_RC4_set_key");
# const char *RC4_options(void);
&function_begin_B("RC4_options");
@@ -254,14 +387,21 @@ $idx="edx";
&blindpop("eax");
&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
&picmeup("edx","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"edx"),20);
- &jnc (&label("skip"));
- &add ("eax",12);
- &set_label("skip");
+ &mov ("edx",&DWP(0,"edx"));
+ &bt ("edx",20);
+ &jc (&label("1xchar"));
+ &bt ("edx",26);
+ &jnc (&label("ret"));
+ &add ("eax",25);
+ &ret ();
+&set_label("1xchar");
+ &add ("eax",12);
+&set_label("ret");
&ret ();
&set_label("opts",64);
&asciz ("rc4(4x,int)");
&asciz ("rc4(1x,char)");
+&asciz ("rc4(8x,mmx)");
&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align (64);
&function_end_B("RC4_options");
diff --git a/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S
new file mode 100644
index 00000000..aab3c6db
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S
@@ -0,0 +1,1259 @@
+.text
+.align 16
+
+.globl rc4_md5_enc
+.type rc4_md5_enc,@function
+rc4_md5_enc:
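+# (Note, ours, not upstream's: per the generator script
+# rc4-md5-x86_64.pl further below, the arguments are %rdi=RC4_KEY,
+# %rsi=RC4 input, %rdx=RC4 output, %rcx=MD5_CTX, %r8=MD5 input,
+# %r9=length in 64-byte blocks.)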
+ cmpq $0,%r9
+ je .Labort
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40,%rsp
+.Lbody:
+ movq %rcx,%r11
+ movq %r9,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %r8,%r15
+ xorq %rbp,%rbp
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%bpl
+ movb -4(%rdi),%cl
+
+ incb %bpl
+ subq %r13,%r14
+ movl (%rdi,%rbp,4),%eax
+ addb %al,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ shlq $6,%r12
+ addq %r15,%r12
+ movq %r12,16(%rsp)
+
+ movq %r11,24(%rsp)
+ movl 0(%r11),%r8d
+ movl 4(%r11),%r9d
+ movl 8(%r11),%r10d
+ movl 12(%r11),%r11d
+ jmp .Loop
+
+.align 16
+.Loop:
+ movl %r8d,0(%rsp)
+ movl %r9d,4(%rsp)
+ movl %r10d,8(%rsp)
+ movl %r11d,%r12d
+ movl %r11d,12(%rsp)
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $3614090360,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 4(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $3905402710,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $606105819,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 12(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $3250441966,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $4118548399,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 20(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1200080426,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $2821735955,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 28(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $4249261313,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $1770035416,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 36(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $2336552879,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $4294925233,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 44(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $2304563134,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $1804603682,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 52(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $4254626195,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $2792965006,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu (%r13),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 60(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $1236535329,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r10d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4129170786,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 24(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $3225465664,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $643717713,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 0(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $3921069994,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $3593408605,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 40(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $38016083,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $3634488961,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 16(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $3889429448,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $568446438,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 56(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $3275163606,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $4107603335,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 32(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1163531501,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $2850285829,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 8(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $4243563512,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $1735328473,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 16(%r13),%xmm3
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 48(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $2368359562,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $4294588738,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 32(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $2272392833,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $1839030562,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 56(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $4259657740,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $2763975236,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 16(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1272893353,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $4139469664,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 40(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $3200236656,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $681279174,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 0(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $3936430074,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $3572445317,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 24(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $76029189,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $3654602809,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 48(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $3873151461,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $530742520,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 32(%r13),%xmm4
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 8(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $3299628645,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,60(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4096336452,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 28(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $1126891415,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $2878612391,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 20(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $4237533241,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $1700485571,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 12(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $2399980690,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $4293915773,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 4(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $2240044497,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $1873313359,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 60(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $4264355552,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $2734768916,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 52(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1309151649,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $4149444226,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 44(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $3174756917,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $718787259,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 48(%r13),%xmm5
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 36(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $3951481745,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rbp,%rsi
+ xorq %rbp,%rbp
+ movb %sil,%bpl
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+ addl 0(%rsp),%r8d
+ addl 4(%rsp),%r9d
+ addl 8(%rsp),%r10d
+ addl 12(%rsp),%r11d
+
+ movdqu %xmm2,(%r14,%r13,1)
+ movdqu %xmm3,16(%r14,%r13,1)
+ movdqu %xmm4,32(%r14,%r13,1)
+ movdqu %xmm5,48(%r14,%r13,1)
+ leaq 64(%r15),%r15
+ leaq 64(%r13),%r13
+ cmpq 16(%rsp),%r15
+ jb .Loop
+
+ movq 24(%rsp),%r12
+ subb %al,%cl
+ movl %r8d,0(%r12)
+ movl %r9d,4(%r12)
+ movl %r10d,8(%r12)
+ movl %r11d,12(%r12)
+ subb $1,%bpl
+ movl %ebp,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq 40(%rsp),%r15
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r13
+ movq 64(%rsp),%r12
+ movq 72(%rsp),%rbp
+ movq 80(%rsp),%rbx
+ leaq 88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+ .byte 0xf3,0xc3
+.size rc4_md5_enc,.-rc4_md5_enc
diff --git a/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 00000000..272fa91e
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
@@ -0,0 +1,632 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is an RC4+MD5 "stitch" implementation. The idea, as spelled out in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below the theoretical maximum, interleaving them allows better
+# utilization of processor resources and thus better performance. The RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on a submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is a fresh implementation aiming to
+# minimize register usage; it is used as the "main thread", with RC4
+# woven into it, one RC4 round per MD5 round. In addition to the
+# stitched subroutine the script can generate standalone replacements
+# for md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for the standalone
+# subroutines, their sum, and the stitched one:
+#
+#               RC4     MD5     RC4+MD5 stitch  gain
+# Opteron       6.5(*)  5.4     11.9    7.0     +70%(*)
+# Core2         6.5     5.8     12.3    7.7     +60%
+# Westmere      4.3     5.2     9.5     7.0     +36%
+# Sandy Bridge  4.2     5.5     9.7     6.8     +43%
+# Atom          9.3     6.5     15.8    11.1    +42%
+#
+# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so the real improvement
+#     is +53%...
+
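+# The RC4 half that gets woven into the MD5 rounds is, in plain-Perl
+# reference form, the classic PRGA step below (a sketch of ours for
+# illustration only, never called by this script; names are not
+# OpenSSL's). The stitched code performs one such step per MD5 step, so
+# one 64-round MD5 block transform also yields 64 bytes of key stream.
+sub rc4_ref_byte {
+    my ($S,$x,$y) = @_;              # $S: 256-entry array ref, $x/$y: refs to indices
+    $$x = ($$x+1)&0xff;
+    $$y = ($$y+$S->[$$x])&0xff;
+    @{$S}[$$x,$$y] = @{$S}[$$y,$$x]; # swap S[x] and S[y]
+    return $S->[($S->[$$x]+$S->[$$y])&0xff];
+}
+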
+my ($rc4,$md5)=(1,1); # what to generate?
+my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
+ # but its result is discarded. The idea here is
+ # to be able to use 'openssl speed rc4' for
+ # benchmarking the stitched subroutine...
+
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+ ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
+ $func="RC4"; $nargs=4;
+} elsif ($md5 && !$rc4) {
+ ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
+ $func="md5_block_asm_data_order"; $nargs=3;
+} else {
+ ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+ $func="rc4_md5_enc"; $nargs=6;
+ # void rc4_md5_enc(
+ # RC4_KEY *key, #
+ # const void *in0, # RC4 input
+ # void *out, # RC4 output
+ # MD5_CTX *ctx, #
+ # const void *inp, # MD5 input
+ # size_t len); # number of 64-byte blocks
+}
+
+my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );
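+
+# (Note, ours: @K above is the standard MD5 T[] table from RFC 1321,
+# i.e. the integer part of abs(sin(i))*2^32 for i=1..64, and can be
+# regenerated for a sanity check as below; $K_check[0]==0xd76aa478.)
+my @K_check = map { int(abs(sin($_))*2**32) } (1..64);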
+
+my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
+my $tmp="%r12d";
+
+my @XX=("%rbp","%rsi"); # RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+my $MOD=32; # 16, 32 or 64
+
+$code.=<<___;
+.text
+.align 16
+
+.globl $func
+.type $func,\@function,$nargs
+$func:
+ cmp \$0,$len
+ je .Labort
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$40,%rsp
+.Lbody:
+___
+if ($rc4) {
+$code.=<<___;
+$D#md5# mov $ctx,%r11 # reassign arguments
+ mov $len,%r12
+ mov $in0,%r13
+ mov $out,%r14
+$D#md5# mov $inp,%r15
+___
+ $ctx="%r11" if ($md5); # reassign arguments
+ $len="%r12";
+ $in0="%r13";
+ $out="%r14";
+ $inp="%r15" if ($md5);
+ $inp=$in0 if (!$md5);
+$code.=<<___;
+ xor $XX[0],$XX[0]
+ xor $YY,$YY
+
+ lea 8($dat),$dat
+ mov -8($dat),$XX[0]#b
+ mov -4($dat),$YY#b
+
+ inc $XX[0]#b
+ sub $in0,$out
+ movl ($dat,$XX[0],4),$TX[0]#d
+___
+$code.=<<___ if (!$md5);
+ xor $TX[1],$TX[1]
+ test \$-128,$len
+ jz .Loop1
+ sub $XX[0],$TX[1]
+ and \$`$MOD-1`,$TX[1]
+ jz .Loop${MOD}_is_hot
+ sub $TX[1],$len
+.Loop${MOD}_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($in0),$TY#b
+ movb $TY#b,($out,$in0)
+ lea 1($in0),$in0
+ dec $TX[1]
+ jnz .Loop${MOD}_warmup
+
+ mov $YY,$TX[1]
+ xor $YY,$YY
+ mov $TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+ mov $len,32(%rsp) # save original $len
+ shr \$6,$len # number of 64-byte blocks
+___
+ if ($D && !$md5) { # stitch in dummy MD5
+ $md5=1;
+ $ctx="%r11";
+ $inp="%r15";
+ $code.=<<___;
+ mov %rsp,$ctx
+ mov $in0,$inp
+___
+ }
+}
+$code.=<<___;
+#rc4# add $TX[0]#b,$YY#b
+#rc4# lea ($dat,$XX[0],4),$XX[1]
+ shl \$6,$len
+ add $inp,$len # pointer to the end of input
+ mov $len,16(%rsp)
+
+#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
+#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
+#md5# mov 1*4($ctx),$V[1]
+#md5# mov 2*4($ctx),$V[2]
+#md5# mov 3*4($ctx),$V[3]
+ jmp .Loop
+
+.align 16
+.Loop:
+#md5# mov $V[0],0*4(%rsp) # put aside current hash value
+#md5# mov $V[1],1*4(%rsp)
+#md5# mov $V[2],2*4(%rsp)
+#md5# mov $V[3],$tmp # forward reference
+#md5# mov $V[3],3*4(%rsp)
+___
+
+sub R0 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot0=(7,12,17,22);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $c,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# and $b,$tmp
+#md5# add 4*`$j`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#md5# xor $d,$tmp
+#rc4# movz $TX[0]#b,$TX[0]#d
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot0[$j%4],$a
+#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+___
+}
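+# (Note, ours: the xor/and/xor sequence on $tmp in R0 computes the MD5
+# round-1 function F(b,c,d) = (b&c)|(~b&d) in the register-saving
+# equivalent form ((c^d)&b)^d; $tmp enters preloaded with d.)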
+sub R1 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot1=(5,9,14,20);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $b,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# and $d,$tmp
+#md5# add 4*`((1+5*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#md5# xor $c,$tmp
+#rc4# movz $TX[0]#b,$TX[0]#d
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot1[$j%4],$a
+#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+___
+}
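+# (Note, ours: R1 computes the round-2 function G(b,c,d) = (b&d)|(c&~d)
+# as the equivalent ((b^c)&d)^c; $tmp enters preloaded with c.)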
+sub R2 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot2=(4,11,16,23);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $c,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# xor $b,$tmp
+#md5# add 4*`((5+3*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#rc4# movz $TX[0]#b,$TX[0]#d
+#md5# add $tmp,$a
+#rc4# movl $TY#d,4*$k($XX[1])
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot2[$j%4],$a
+#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+___
+}
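+# (Note, ours: R2 computes the round-3 function H(b,c,d) = b^c^d
+# directly; $tmp enters preloaded with d by the previous round.)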
+sub R3 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot3=(6,10,15,21);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $d,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# or $b,$tmp
+#md5# add 4*`((7*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#rc4# movz $TX[0]#b,$TX[0]#d
+#md5# xor $c,$tmp
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot3[$j%4],$a
+#md5# mov \$-1,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ mov $XX[0],$XX[1]
+ xor $XX[0],$XX[0] # keyword to partial register
+ mov $XX[1]#b,$XX[0]#b
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+___
+}
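+# (Note, ours: R3 computes the round-4 function I(b,c,d) = c^(b|~d);
+# $tmp enters as -1, so the first xor yields ~d.)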
+
+my $i=0;
+for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+
+$code.=<<___;
+#md5# add 0*4(%rsp),$V[0] # accumulate hash value
+#md5# add 1*4(%rsp),$V[1]
+#md5# add 2*4(%rsp),$V[2]
+#md5# add 3*4(%rsp),$V[3]
+
+#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
+#rc4# movdqu %xmm3,16($out,$in0)
+#rc4# movdqu %xmm4,32($out,$in0)
+#rc4# movdqu %xmm5,48($out,$in0)
+#md5# lea 64($inp),$inp
+#rc4# lea 64($in0),$in0
+ cmp 16(%rsp),$inp # are we done?
+ jb .Loop
+
+#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
+#rc4# sub $TX[0]#b,$YY#b # correct $YY
+#md5# mov $V[0],0*4($len) # write MD5_CTX
+#md5# mov $V[1],1*4($len)
+#md5# mov $V[2],2*4($len)
+#md5# mov $V[3],3*4($len)
+___
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+ mov 32(%rsp),$len # restore original $len
+ and \$63,$len # remaining bytes
+ jnz .Loop1
+ jmp .Ldone
+
+.align 16
+.Loop1:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($in0),$TY#b
+ movb $TY#b,($out,$in0)
+ lea 1($in0),$in0
+ dec $len
+ jnz .Loop1
+
+.Ldone:
+___
+$code.=<<___;
+#rc4# sub \$1,$XX[0]#b
+#rc4# movl $XX[0]#d,-8($dat)
+#rc4# movl $YY#d,-4($dat)
+
+ mov 40(%rsp),%r15
+ mov 48(%rsp),%r14
+ mov 56(%rsp),%r13
+ mov 64(%rsp),%r12
+ mov 72(%rsp),%rbp
+ mov 80(%rsp),%rbx
+ lea 88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+ ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) { # sole purpose of this section is to provide
+ # option to use the generated module as drop-in
+ # replacement for rc4-x86_64.pl for debugging
+ # and testing purposes...
+my ($idx,$ido)=("%r8","%r9");
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+ jmp .Lw1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@abi-omnipotent
+.align 16
+RC4_options:
+ lea .Lopts(%rip),%rax
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(64x,int)"
+.align 64
+.size RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lbody
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 40(%rax),%r15
+ mov 48(%rax),%r14
+ mov 56(%rax),%r13
+ mov 64(%rax),%r12
+ mov 72(%rax),%rbp
+ mov 80(%rax),%rbx
+ lea 88(%rax),%rax
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_$func
+ .rva .LSEH_end_$func
+ .rva .LSEH_info_$func
+
+.section .xdata
+.align 8
+.LSEH_info_$func:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
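+# (Examples, ours: reg_part("%r12","d") -> "%r12d",
+# reg_part("%eax","b") -> "%al", reg_part("%rbp","b") -> "%bpl";
+# this is what expands the "#b"/"#w"/"#d" suffixes used above.)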
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd /gm;
+
+$code =~ s/#md5#//gm if ($md5);
+$code =~ s/#rc4#//gm if ($rc4);
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/rc4/asm/rc4-parisc.pl b/main/openssl/crypto/rc4/asm/rc4-parisc.pl
new file mode 100644
index 00000000..ad7e6565
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-parisc.pl
@@ -0,0 +1,314 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# RC4 for PA-RISC.
+
+# June 2009.
+#
+# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
+# For reference, the [4x] unrolled loop is >40% faster than the folded
+# one. It's possible to unroll the loop 8 times on PA-RISC 2.0, but the
+# improvement is not believed to be sufficient to justify the effort...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
+ # [+ argument transfer]
+$SZ=1; # defaults to RC4_CHAR
+if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+RC4_INT\s+(.*)/) {
+ $SZ = ($1=~/char$/) ? 1 : 4;
+ last;
+ }
+ }
+ close CONF;
+}
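+# (e.g. a "#define RC4_INT unsigned char" line in opensslconf.h selects
+# $SZ=1, the RC4_CHAR flavour; any other RC4_INT definition selects
+# $SZ=4)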
+
+if ($SZ==1) { # RC4_CHAR
+ $LD="ldb";
+ $LDX="ldbx";
+ $MKX="addl";
+ $ST="stb";
+} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
+ $LD="ldw";
+ $LDX="ldwx,s";
+ $MKX="sh2addl";
+ $ST="stw";
+}
+
+$key="%r26";
+$len="%r25";
+$inp="%r24";
+$out="%r23";
+
+@XX=("%r19","%r20");
+@TX=("%r21","%r22");
+$YY="%r28";
+$TY="%r29";
+
+$acc="%r1";
+$ix="%r2";
+$iy="%r3";
+$dat0="%r4";
+$dat1="%r5";
+$rem="%r6";
+$mask="%r31";
+
+sub unrolledloopbody {
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+ ldo 1($XX[0]),$XX[1]
+ `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
+ and $mask,$XX[1],$XX[1]
+ $LDX $YY($key),$TY
+ $MKX $YY,$key,$ix
+ $LDX $XX[1]($key),$TX[1]
+ $MKX $XX[0],$key,$iy
+ $ST $TX[0],0($ix)
+ comclr,<> $XX[1],$YY,%r0 ; conditional
+ copy $TX[0],$TX[1] ; move
+ `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
+ $ST $TY,0($iy)
+ addl $TX[0],$TY,$TY
+ addl $TX[1],$YY,$YY
+ and $mask,$TY,$TY
+ and $mask,$YY,$YY
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+} }
+
+sub foldedloop {
+my ($label,$count)=@_;
+$code.=<<___;
+$label
+ $MKX $YY,$key,$iy
+ $LDX $YY($key),$TY
+ $MKX $XX[0],$key,$ix
+ $ST $TX[0],0($iy)
+ ldo 1($XX[0]),$XX[0]
+ $ST $TY,0($ix)
+ addl $TX[0],$TY,$TY
+ ldbx $inp($out),$dat1
+ and $mask,$TY,$TY
+ and $mask,$XX[0],$XX[0]
+ $LDX $TY($key),$acc
+ $LDX $XX[0]($key),$TX[0]
+ ldo 1($out),$out
+ xor $dat1,$acc,$acc
+ addl $TX[0],$YY,$YY
+ stb $acc,-1($out)
+ addib,<> -1,$count,$label ; $count is always small
+ and $mask,$YY,$YY
+___
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+RC4
+ .PROC
+ .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+
+ cmpib,*= 0,$len,L\$abort
+ sub $inp,$out,$inp ; distance between $inp and $out
+
+ $LD `0*$SZ`($key),$XX[0]
+ $LD `1*$SZ`($key),$YY
+ ldo `2*$SZ`($key),$key
+
+ ldi 0xff,$mask
+ ldi 3,$dat0
+
+ ldo 1($XX[0]),$XX[0] ; warm up loop
+ and $mask,$XX[0],$XX[0]
+ $LDX $XX[0]($key),$TX[0]
+ addl $TX[0],$YY,$YY
+ cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
+ and $mask,$YY,$YY
+
+ and,<> $out,$dat0,$rem ; is $out aligned?
+ b L\$alignedout
+ subi 4,$rem,$rem
+ sub $len,$rem,$len
+___
+&foldedloop("L\$alignout",$rem); # process till $out is aligned
+
+$code.=<<___;
+L\$alignedout ; $len is at least 4 here
+ and,<> $inp,$dat0,$acc ; is $inp aligned?
+ b L\$oop4
+ sub $inp,$acc,$rem ; align $inp
+
+ sh3addl $acc,%r0,$acc
+ subi 32,$acc,$acc
+ mtctl $acc,%cr11 ; load %sar with vshd align factor
+ ldwx $rem($out),$dat0
+ ldo 4($rem),$rem
+L\$oop4misalignedinp
+___
+&unrolledloopbody();
+$code.=<<___;
+ $LDX $TY($key),$ix
+ ldwx $rem($out),$dat1
+ ldo -4($len),$len
+ or $ix,$acc,$acc ; last piece, no need to dep
+ vshd $dat0,$dat1,$iy ; align data
+ copy $dat1,$dat0
+ xor $iy,$acc,$acc
+ stw $acc,0($out)
+ cmpib,*<< 3,$len,L\$oop4misalignedinp
+ ldo 4($out),$out
+ cmpib,*= 0,$len,L\$done
+ nop
+ b L\$oop1
+ nop
+
+ .ALIGN 8
+L\$oop4
+___
+&unrolledloopbody();
+$code.=<<___;
+ $LDX $TY($key),$ix
+ ldwx $inp($out),$dat0
+ ldo -4($len),$len
+ or $ix,$acc,$acc ; last piece, no need to dep
+ xor $dat0,$acc,$acc
+ stw $acc,0($out)
+ cmpib,*<< 3,$len,L\$oop4
+ ldo 4($out),$out
+ cmpib,*= 0,$len,L\$done
+ nop
+___
+&foldedloop("L\$oop1",$len);
+$code.=<<___;
+L\$done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2
+ ldo -1($XX[0]),$XX[0] ; chill out loop
+ sub $YY,$TX[0],$YY
+ and $mask,$XX[0],$XX[0]
+ and $mask,$YY,$YY
+ $ST $XX[0],`-2*$SZ`($key)
+ $ST $YY,`-1*$SZ`($key)
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+___
+
+$code.=<<___;
+
+ .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 8
+private_RC4_set_key
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ $ST %r0,`0*$SZ`($key)
+ $ST %r0,`1*$SZ`($key)
+ ldo `2*$SZ`($key),$key
+ copy %r0,@XX[0]
+L\$1st
+ $ST @XX[0],0($key)
+ ldo 1(@XX[0]),@XX[0]
+ bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
+ ldo $SZ($key),$key
+
+ ldo `-256*$SZ`($key),$key ; rewind $key
+ addl $len,$inp,$inp ; $inp to point at the end
+ sub %r0,$len,%r23 ; inverse index
+ copy %r0,@XX[0]
+ copy %r0,@XX[1]
+ ldi 0xff,$mask
+
+L\$2nd
+ $LDX @XX[0]($key),@TX[0]
+ ldbx %r23($inp),@TX[1]
+ addi,nuv 1,%r23,%r23 ; increment and conditional
+ sub %r0,$len,%r23 ; inverse index
+ addl @TX[0],@XX[1],@XX[1]
+ addl @TX[1],@XX[1],@XX[1]
+ and $mask,@XX[1],@XX[1]
+ $MKX @XX[0],$key,$TY
+ $LDX @XX[1]($key),@TX[1]
+ $MKX @XX[1],$key,$YY
+ ldo 1(@XX[0]),@XX[0]
+ $ST @TX[0],0($YY)
+ bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
+ $ST @TX[1],0($TY)
+
+ bv,n (%r2)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT RC4_options,ENTRY
+ .ALIGN 8
+RC4_options
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ blr %r0,%r28
+ ldi 3,%r1
+L\$pic
+ andcm %r28,%r1,%r28
+ bv (%r2)
+ .EXIT
+ ldo L\$opts-L\$pic(%r28),%r28
+ .PROCEND
+ .ALIGN 8
+L\$opts
+ .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
+ .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/rc4/asm/rc4-s390x.pl b/main/openssl/crypto/rc4/asm/rc4-s390x.pl
index 96681fa0..7528ece1 100644
--- a/main/openssl/crypto/rc4/asm/rc4-s390x.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-s390x.pl
@@ -13,6 +13,29 @@
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
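+# (e.g. flavour "linux31" or "linux32" selects the 31-bit ABI with
+# 4-byte stack slots and stm/lm, anything else the 64-bit ABI with
+# 8-byte slots and stmg/lmg)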
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$rp="%r14";
$sp="%r15";
$code=<<___;
@@ -39,7 +62,12 @@ $code.=<<___;
.type RC4,\@function
.align 64
RC4:
- stmg %r6,%r11,48($sp)
+ stm${g} %r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
@@ -90,7 +118,7 @@ $code.=<<___;
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
- brct $cnt,.Loop8
+ brctg $cnt,.Loop8
.Lshort:
lghi $acc,7
@@ -122,7 +150,7 @@ $code.=<<___;
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
- lmg %r6,%r11,48($sp)
+ lm${g} %r6,%r11,6*$SIZE_T($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -143,11 +171,11 @@ $ikey="%r7";
$iinp="%r8";
$code.=<<___;
-.globl RC4_set_key
-.type RC4_set_key,\@function
+.globl private_RC4_set_key
+.type private_RC4_set_key,\@function
.align 64
-RC4_set_key:
- stmg %r6,%r8,48($sp)
+private_RC4_set_key:
+ stm${g} %r6,%r8,6*$SIZE_T($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
@@ -180,9 +208,9 @@ RC4_set_key:
la $iinp,0(%r0)
j .L2ndloop
.Ldone:
- lmg %r6,%r8,48($sp)
+ lm${g} %r6,%r8,6*$SIZE_T($sp)
br $rp
-.size RC4_set_key,.-RC4_set_key
+.size private_RC4_set_key,.-private_RC4_set_key
___
}
@@ -203,3 +231,4 @@ RC4_options:
___
print $code;
+close STDOUT; # force flush
diff --git a/main/openssl/crypto/rc4/asm/rc4-x86_64.S b/main/openssl/crypto/rc4/asm/rc4-x86_64.S
new file mode 100644
index 00000000..af161582
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-x86_64.S
@@ -0,0 +1,615 @@
+.text
+
+
+.globl RC4
+.type RC4,@function
+.align 16
+RC4: orq %rsi,%rsi
+ jne .Lentry
+ .byte 0xf3,0xc3
+.Lentry:
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+.Lprologue:
+ movq %rsi,%r11
+ movq %rdx,%r12
+ movq %rcx,%r13
+ xorq %r10,%r10
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%r10b
+ movb -4(%rdi),%cl
+ cmpl $-1,256(%rdi)
+ je .LRC4_CHAR
+ movl OPENSSL_ia32cap_P(%rip),%r8d
+ xorq %rbx,%rbx
+ incb %r10b
+ subq %r10,%rbx
+ subq %r12,%r13
+ movl (%rdi,%r10,4),%eax
+ testq $-16,%r11
+ jz .Lloop1
+ btl $30,%r8d
+ jc .Lintel
+ andq $7,%rbx
+ leaq 1(%r10),%rsi
+ jz .Loop8
+ subq %rbx,%r11
+.Loop8_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz .Loop8_warmup
+
+ leaq 1(%r10),%rsi
+ jmp .Loop8
+.align 16
+.Loop8:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 0(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,0(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,4(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 8(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,8(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 12(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,12(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 16(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,16(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 20(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,20(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 24(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,24(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%sil
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl -4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,28(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%r10b
+ rorq $8,%r8
+ subq $8,%r11
+
+ xorq (%r12),%r8
+ movq %r8,(%r13,%r12,1)
+ leaq 8(%r12),%r12
+
+ testq $-8,%r11
+ jnz .Loop8
+ cmpq $0,%r11
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lintel:
+ testq $-32,%r11
+ jz .Lloop1
+ andq $15,%rbx
+ jz .Loop16_is_hot
+ subq %rbx,%r11
+.Loop16_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz .Loop16_warmup
+
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ movb %bl,%cl
+
+.Loop16_is_hot:
+ leaq (%rdi,%r10,4),%rsi
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ jmp .Loop16_enter
+.align 16
+.Loop16:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm2
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ pxor %xmm1,%xmm2
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ movdqu %xmm2,(%r13,%r12,1)
+ leaq 16(%r12),%r12
+.Loop16_enter:
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm1,%xmm1
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ pinsrw $0,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+ addb $16,%r10b
+ movdqu (%r12),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ leaq (%rdi,%r10,4),%rsi
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+ movl (%rsi),%eax
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ subq $16,%r11
+ movb %bl,%cl
+ testq $-16,%r11
+ jnz .Loop16
+
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,(%r13,%r12,1)
+ leaq 16(%r12),%r12
+
+ cmpq $0,%r11
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lloop1:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %r11
+ jnz .Lloop1
+ jmp .Lexit
+
+.align 16
+.LRC4_CHAR:
+ addb $1,%r10b
+ movzbl (%rdi,%r10,1),%eax
+ testq $-8,%r11
+ jz .Lcloop1
+ jmp .Lcloop8
+.align 16
+.Lcloop8:
+ movl (%r12),%r8d
+ movl 4(%r12),%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov0
+ movq %rax,%rbx
+.Lcmov0:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov1
+ movq %rbx,%rax
+.Lcmov1:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov2
+ movq %rax,%rbx
+.Lcmov2:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov3
+ movq %rbx,%rax
+.Lcmov3:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov4
+ movq %rax,%rbx
+.Lcmov4:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov5
+ movq %rbx,%rax
+.Lcmov5:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov6
+ movq %rax,%rbx
+.Lcmov6:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov7
+ movq %rbx,%rax
+.Lcmov7:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ leaq -8(%r11),%r11
+ movl %r8d,(%r13)
+ leaq 8(%r12),%r12
+ movl %r9d,4(%r13)
+ leaq 8(%r13),%r13
+
+ testq $-8,%r11
+ jnz .Lcloop8
+ cmpq $0,%r11
+ jne .Lcloop1
+ jmp .Lexit
+.align 16
+.Lcloop1:
+ addb %al,%cl
+ movzbl %cl,%ecx
+ movzbl (%rdi,%rcx,1),%edx
+ movb %al,(%rdi,%rcx,1)
+ movb %dl,(%rdi,%r10,1)
+ addb %al,%dl
+ addb $1,%r10b
+ movzbl %dl,%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%rdx,1),%edx
+ movzbl (%rdi,%r10,1),%eax
+ xorb (%r12),%dl
+ leaq 1(%r12),%r12
+ movb %dl,(%r13)
+ leaq 1(%r13),%r13
+ subq $1,%r11
+ jnz .Lcloop1
+ jmp .Lexit
+
+.align 16
+.Lexit:
+ subb $1,%r10b
+ movl %r10d,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq (%rsp),%r13
+ movq 8(%rsp),%r12
+ movq 16(%rsp),%rbx
+ addq $24,%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size RC4,.-RC4
+.globl private_RC4_set_key
+.type private_RC4_set_key,@function
+.align 16
+private_RC4_set_key:
+ leaq 8(%rdi),%rdi
+ leaq (%rdx,%rsi,1),%rdx
+ negq %rsi
+ movq %rsi,%rcx
+ xorl %eax,%eax
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+
+ movl OPENSSL_ia32cap_P(%rip),%r8d
+ btl $20,%r8d
+ jc .Lc1stloop
+ jmp .Lw1stloop
+
+.align 16
+.Lw1stloop:
+ movl %eax,(%rdi,%rax,4)
+ addb $1,%al
+ jnc .Lw1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.align 16
+.Lw2ndloop:
+ movl (%rdi,%r9,4),%r10d
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movl (%rdi,%r8,4),%r11d
+ cmovzq %rcx,%rsi
+ movl %r10d,(%rdi,%r8,4)
+ movl %r11d,(%rdi,%r9,4)
+ addb $1,%r9b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ movb %al,(%rdi,%rax,1)
+ addb $1,%al
+ jnc .Lc1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.align 16
+.Lc2ndloop:
+ movb (%rdi,%r9,1),%r10b
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movb (%rdi,%r8,1),%r11b
+ jnz .Lcnowrap
+ movq %rcx,%rsi
+.Lcnowrap:
+ movb %r10b,(%rdi,%r8,1)
+ movb %r11b,(%rdi,%r9,1)
+ addb $1,%r9b
+ jnc .Lc2ndloop
+ movl $-1,256(%rdi)
+
+.align 16
+.Lexit_key:
+ xorl %eax,%eax
+ movl %eax,-8(%rdi)
+ movl %eax,-4(%rdi)
+ .byte 0xf3,0xc3
+.size private_RC4_set_key,.-private_RC4_set_key
+
+.globl RC4_options
+.type RC4_options,@function
+.align 16
+RC4_options:
+ leaq .Lopts(%rip),%rax
+ movl OPENSSL_ia32cap_P(%rip),%edx
+ btl $20,%edx
+ jc .L8xchar
+ btl $30,%edx
+ jnc .Ldone
+ addq $25,%rax
+ .byte 0xf3,0xc3
+.L8xchar:
+ addq $12,%rax
+.Ldone:
+ .byte 0xf3,0xc3
+.align 64
+.Lopts:
+.byte 114,99,52,40,56,120,44,105,110,116,41,0
+.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
+.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
+.byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.size RC4_options,.-RC4_options
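
For orientation, the unrolled .Loop8/.Loop16 paths above compute the standard RC4 PRGA step eight or sixteen times per iteration. A minimal C sketch of that single step (illustrative only, not the OpenSSL code; S is the 256-entry state, x and y the indices the assembly keeps in %r10 and %rcx):

    /* One RC4 keystream step: advance x, mix y, swap, emit. */
    static unsigned char rc4_step(unsigned char S[256],
                                  unsigned char *x, unsigned char *y)
    {
            unsigned char tx, ty;

            *x = (unsigned char)(*x + 1);
            tx = S[*x];
            *y = (unsigned char)(*y + tx);
            ty = S[*y];
            S[*y] = tx;                             /* swap S[x] and S[y] */
            S[*x] = ty;
            return S[(unsigned char)(tx + ty)];     /* keystream byte */
    }

Each keystream byte is XORed into the input; the 8x path collects eight of them in %r8 before a single 64-bit store, which is why %r8 is rotated by 8 between steps.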
diff --git a/main/openssl/crypto/rc4/asm/rc4-x86_64.pl b/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe..20722d3e 100755..100644
--- a/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
@@ -7,6 +7,8 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
+# July 2004
+#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
@@ -19,6 +21,8 @@
# to operate on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...
+# November 2004
+#
# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
@@ -26,6 +30,8 @@
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+# November 2004
+#
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
@@ -33,10 +39,14 @@
# on either AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
+# April 2005
+#
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...
+# May 2005
+#
# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
@@ -46,10 +56,12 @@
# achieves respectful 432MBps on 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
-# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
+# March 2007
+#
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
@@ -58,6 +70,37 @@
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
+# May 2010
+#
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
+# May 2011
+#
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in an attempt to optimize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
+#
+# Opteron 5.3/+0%(*)
+# P4 6.5
+# Core2 6.2/+15%(**)
+# Westmere 4.2/+60%
+# Sandy Bridge 4.2/+120%
+# Atom 9.3/+80%
+#
+# (*)  But corresponding loop has fewer instructions, which should have
+# positive effect on upcoming Bulldozer, which has one less ALU.
+# For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
+# for 32-bit code, meaning that it's possible to improve it,
+# but more than likely at the cost of the others (see rc4-586.pl
+# to get the idea)...
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -69,20 +112,18 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
$inp="%rdx"; # arg3
$out="%rcx"; # arg4
-@XX=("%r8","%r10");
-@TX=("%r9","%r11");
-$YY="%r12";
-$TY="%r13";
-
+{
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl RC4
.type RC4,\@function,4
@@ -95,48 +136,173 @@ RC4: or $len,$len
push %r12
push %r13
.Lprologue:
+ mov $len,%r11
+ mov $inp,%r12
+ mov $out,%r13
+___
+my $len="%r11"; # reassign input arguments
+my $inp="%r12";
+my $out="%r13";
- add \$8,$dat
- movl -8($dat),$XX[0]#d
- movl -4($dat),$YY#d
+my @XX=("%r10","%rsi");
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+$code.=<<___;
+ xor $XX[0],$XX[0]
+ xor $YY,$YY
+
+ lea 8($dat),$dat
+ mov -8($dat),$XX[0]#b
+ mov -4($dat),$YY#b
cmpl \$-1,256($dat)
je .LRC4_CHAR
+ mov OPENSSL_ia32cap_P(%rip),%r8d
+ xor $TX[1],$TX[1]
inc $XX[0]#b
+ sub $XX[0],$TX[1]
+ sub $inp,$out
movl ($dat,$XX[0],4),$TX[0]#d
- test \$-8,$len
+ test \$-16,$len
jz .Lloop1
- jmp .Lloop8
+ bt \$30,%r8d # Intel CPU?
+ jc .Lintel
+ and \$7,$TX[1]
+ lea 1($XX[0]),$XX[1]
+ jz .Loop8
+ sub $TX[1],$len
+.Loop8_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
+ dec $TX[1]
+ jnz .Loop8_warmup
+
+ lea 1($XX[0]),$XX[1]
+ jmp .Loop8
.align 16
-.Lloop8:
+.Loop8:
___
for ($i=0;$i<8;$i++) {
+$code.=<<___ if ($i==7);
+ add \$8,$XX[1]#b
+___
$code.=<<___;
add $TX[0]#b,$YY#b
- mov $XX[0],$XX[1]
movl ($dat,$YY,4),$TY#d
- ror \$8,%rax # ror is redundant when $i=0
- inc $XX[1]#b
- movl ($dat,$XX[1],4),$TX[1]#d
- cmp $XX[1],$YY
movl $TX[0]#d,($dat,$YY,4)
- cmove $TX[0],$TX[1]
- movl $TY#d,($dat,$XX[0],4)
+ movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
+ ror \$8,%r8 # ror is redundant when $i=0
+ movl $TY#d,4*$i($dat,$XX[0],4)
add $TX[0]#b,$TY#b
- movb ($dat,$TY,4),%al
+ movb ($dat,$TY,4),%r8b
___
-push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
- ror \$8,%rax
+ add \$8,$XX[0]#b
+ ror \$8,%r8
sub \$8,$len
- xor ($inp),%rax
- add \$8,$inp
- mov %rax,($out)
- add \$8,$out
+ xor ($inp),%r8
+ mov %r8,($out,$inp)
+ lea 8($inp),$inp
test \$-8,$len
- jnz .Lloop8
+ jnz .Loop8
+ cmp \$0,$len
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lintel:
+ test \$-32,$len
+ jz .Lloop1
+ and \$15,$TX[1]
+ jz .Loop16_is_hot
+ sub $TX[1],$len
+.Loop16_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
+ dec $TX[1]
+ jnz .Loop16_warmup
+
+ mov $YY,$TX[1]
+ xor $YY,$YY
+ mov $TX[1]#b,$YY#b
+
+.Loop16_is_hot:
+ lea ($dat,$XX[0],4),$XX[1]
+___
+sub RC4_loop {
+ my $i=shift;
+ my $j=$i<0?0:$i;
+ my $xmm="%xmm".($j&1);
+
+ $code.=" add \$16,$XX[0]#b\n" if ($i==15);
+ $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
+ $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
+ $code.=" movl ($dat,$YY,4),$TY#d\n";
+ $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
+ $code.=" psllq \$8,%xmm1\n" if ($i==0);
+ $code.=" pxor $xmm,$xmm\n" if ($i<=1);
+ $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
+ $code.=" add $TY#b,$TX[0]#b\n";
+ $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
+ $code.=" movz $TX[0]#b,$TX[0]#d\n";
+ $code.=" movl $TY#d,4*$j($XX[1])\n";
+ $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
+ $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
+ $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
+ $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
+ $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
+ $code.=" lea 16($inp),$inp\n" if ($i==0);
+ $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
+}
+ RC4_loop(-1);
+$code.=<<___;
+ jmp .Loop16_enter
+.align 16
+.Loop16:
+___
+
+for ($i=0;$i<16;$i++) {
+ $code.=".Loop16_enter:\n" if ($i==1);
+ RC4_loop($i);
+ push(@TX,shift(@TX)); # "rotate" registers
+}
+$code.=<<___;
+ mov $YY,$TX[1]
+ xor $YY,$YY # keyword to partial register
+ sub \$16,$len
+ mov $TX[1]#b,$YY#b
+ test \$-16,$len
+ jnz .Loop16
+
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,($out,$inp)
+ lea 16($inp),$inp
+
cmp \$0,$len
jne .Lloop1
jmp .Lexit
@@ -152,9 +318,8 @@ $code.=<<___;
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
- inc $inp
- movb $TY#b,($out)
- inc $out
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
dec $len
jnz .Lloop1
jmp .Lexit
@@ -165,13 +330,11 @@ $code.=<<___;
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
- cmpl \$0,260($dat)
- jnz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
- mov ($inp),%eax
- mov 4($inp),%ebx
+ mov ($inp),%r8d
+ mov 4($inp),%r9d
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
for ($i=0;$i<4;$i++) {
@@ -188,8 +351,8 @@ $code.=<<___;
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
- xor ($dat,$TY),%al
- ror \$8,%eax
+ xor ($dat,$TY),%r8b
+ ror \$8,%r8d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
@@ -207,16 +370,16 @@ $code.=<<___;
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
- xor ($dat,$TY),%bl
- ror \$8,%ebx
+ xor ($dat,$TY),%r9b
+ ror \$8,%r9d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
lea -8($len),$len
- mov %eax,($out)
+ mov %r8d,($out)
lea 8($inp),$inp
- mov %ebx,4($out)
+ mov %r9d,4($out)
lea 8($out),$out
test \$-8,$len
@@ -229,6 +392,7 @@ $code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
+ movzb $YY#b,$YY#d
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
@@ -260,16 +424,16 @@ $code.=<<___;
ret
.size RC4,.-RC4
___
+}
$idx="%r8";
$ido="%r9";
$code.=<<___;
-.extern OPENSSL_ia32cap_P
-.globl RC4_set_key
-.type RC4_set_key,\@function,3
+.globl private_RC4_set_key
+.type private_RC4_set_key,\@function,3
.align 16
-RC4_set_key:
+private_RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
@@ -280,12 +444,9 @@ RC4_set_key:
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
- bt \$20,$idx#d
- jnc .Lw1stloop
- bt \$30,$idx#d
- setc $ido#b
- mov $ido#d,260($dat)
- jmp .Lc1stloop
+ bt \$20,$idx#d # RC4_CHAR?
+ jc .Lc1stloop
+ jmp .Lw1stloop
.align 16
.Lw1stloop:
@@ -339,7 +500,7 @@ RC4_set_key:
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
-.size RC4_set_key,.-RC4_set_key
+.size private_RC4_set_key,.-private_RC4_set_key
.globl RC4_options
.type RC4_options,\@abi-omnipotent
@@ -348,18 +509,20 @@ RC4_options:
lea .Lopts(%rip),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
- jnc .Ldone
- add \$12,%rax
+ jc .L8xchar
bt \$30,%edx
jnc .Ldone
- add \$13,%rax
+ add \$25,%rax
+ ret
+.L8xchar:
+ add \$12,%rax
.Ldone:
ret
.align 64
.Lopts:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
-.asciz "rc4(1x,char)"
+.asciz "rc4(16x,int)"
.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
.size RC4_options,.-RC4_options
@@ -482,22 +645,32 @@ key_se_handler:
.rva .LSEH_end_RC4
.rva .LSEH_info_RC4
- .rva .LSEH_begin_RC4_set_key
- .rva .LSEH_end_RC4_set_key
- .rva .LSEH_info_RC4_set_key
+ .rva .LSEH_begin_private_RC4_set_key
+ .rva .LSEH_end_private_RC4_set_key
+ .rva .LSEH_info_private_RC4_set_key
.section .xdata
.align 8
.LSEH_info_RC4:
.byte 9,0,0,0
.rva stream_se_handler
-.LSEH_info_RC4_set_key:
+.LSEH_info_private_RC4_set_key:
.byte 9,0,0,0
.rva key_se_handler
___
}
-$code =~ s/#([bwd])/$1/gm;
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
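
The reg_part() postprocessor above is what lets the perlasm source write %rax#b or %r8#d and still emit valid AT&T syntax: it maps a 64-bit register name plus suffix to the matching subregister. An illustrative mapping, expressed as a C comment:

    /* Substitutions performed by reg_part() (illustrative):
     *   %rax#b -> %al      %rax#w -> %ax      %rax#d -> %eax
     *   %r8#b  -> %r8b     %r8#w  -> %r8w     %r8#d  -> %r8d
     */

The numbered registers simply gain the suffix letter, while the legacy registers need the %al/%ax/%eax renames; those are exactly the two regex branches in the sub.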
diff --git a/main/openssl/crypto/rc4/rc4.h b/main/openssl/crypto/rc4/rc4.h
index 29d1accc..88ceb46b 100644
--- a/main/openssl/crypto/rc4/rc4.h
+++ b/main/openssl/crypto/rc4/rc4.h
@@ -79,6 +79,7 @@ typedef struct rc4_key_st
const char *RC4_options(void);
void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata);
diff --git a/main/openssl/crypto/rc4/rc4_skey.c b/main/openssl/crypto/rc4/rc4_skey.c
index b22c40b0..fda27636 100644
--- a/main/openssl/crypto/rc4/rc4_skey.c
+++ b/main/openssl/crypto/rc4/rc4_skey.c
@@ -85,7 +85,7 @@ const char *RC4_options(void)
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
-void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
{
register RC4_INT tmp;
register int id1,id2;
@@ -104,40 +104,6 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
d[(n)]=d[id2]; \
d[id2]=tmp; }
-#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
- defined(__INTEL__) || \
- defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
- if (sizeof(RC4_INT) > 1) {
- /*
- * Unlike all other x86 [and x86_64] implementations,
- * Intel P4 core [including EM64T] was found to perform
- * poorly with wider RC4_INT. Performance improvement
- * for IA-32 hand-coded assembler turned out to be 2.8x
- * if re-coded for RC4_CHAR! It's however inappropriate
- * to just switch to RC4_CHAR for x86[_64], as non-P4
- * implementations suffer from significant performance
- * losses then, e.g. PIII exhibits >2x deterioration,
- * and so does Opteron. In order to assure optimal
- * all-round performance, let us [try to] detect P4 at
- * run-time by checking upon HTT bit in CPU capability
- * vector and set up compressed key schedule, which is
- * recognized by correspondingly updated assembler
- * module...
- * <appro@fy.chalmers.se>
- */
- if (OPENSSL_ia32cap_P & (1<<28)) {
- unsigned char *cp=(unsigned char *)d;
-
- for (i=0;i<256;i++) cp[i]=i;
- for (i=0;i<256;i++) SK_LOOP(cp,i);
- /* mark schedule as compressed! */
- d[256/sizeof(RC4_INT)]=-1;
- return;
- }
- }
-# endif
-#endif
for (i=0; i < 256; i++) d[i]=i;
for (i=0; i < 256; i+=4)
{
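
With the C-level P4 detection removed, the choice between the int-sized and the compressed char-sized schedule now lives entirely in the assembly: private_RC4_set_key tests OPENSSL_ia32cap_P and, on the char path, still plants the -1 marker at offset 256 that the RC4 routine's cmpl $-1,256($dat) dispatch checks. A hedged sketch of the equivalent decision in C, for reference only:

    /* Illustrative only -- mirrors the bt $20 / movl $-1,256(%rdi)
     * sequence in the generated assembly above; treating bit 20 as
     * the P4-class indicator is an assumption about the ia32cap
     * vector layout. */
    extern unsigned int OPENSSL_ia32cap_P[2];

    static int use_char_schedule(void)
    {
            return (OPENSSL_ia32cap_P[0] >> 20) & 1;
    }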
diff --git a/main/openssl/crypto/rc4/rc4_utl.c b/main/openssl/crypto/rc4/rc4_utl.c
new file mode 100644
index 00000000..ab3f02fe
--- /dev/null
+++ b/main/openssl/crypto/rc4/rc4_utl.c
@@ -0,0 +1,62 @@
+/* crypto/rc4/rc4_utl.c -*- mode:C; c-file-style: "eay" -*- */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include <openssl/rc4.h>
+
+void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(RC4);
+#endif
+ private_RC4_set_key(key, len, data);
+ }
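
The new rc4_utl.c keeps the public entry point stable while routing the real work to private_RC4_set_key (the FIPS abort hook is compiled in only under OPENSSL_FIPS). Callers are unaffected; a minimal, hedged usage sketch with placeholder key bytes:

    #include <stddef.h>
    #include <openssl/rc4.h>

    /* Illustrative use of the unchanged public API; the key is a
     * throwaway demo value, not a recommendation. */
    void rc4_xor(unsigned char *out, const unsigned char *in, size_t len)
    {
            static const unsigned char k[16] = { 0x01, 0x02, 0x03, 0x04 };
            RC4_KEY key;

            RC4_set_key(&key, sizeof(k), k);   /* -> private_RC4_set_key */
            RC4(&key, len, in, out);           /* encrypt == decrypt */
    }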
diff --git a/main/openssl/crypto/rc4/rc4test.c b/main/openssl/crypto/rc4/rc4test.c
index 633a79e7..4312605c 100644
--- a/main/openssl/crypto/rc4/rc4test.c
+++ b/main/openssl/crypto/rc4/rc4test.c
@@ -120,6 +120,12 @@ int main(int argc, char *argv[])
RC4_KEY key;
unsigned char obuf[512];
+#if !defined(OPENSSL_PIC)
+ void OPENSSL_cpuid_setup(void);
+
+ OPENSSL_cpuid_setup();
+#endif
+
for (i=0; i<6; i++)
{
RC4_set_key(&key,keys[i][0],&(keys[i][1]));
diff --git a/main/openssl/crypto/ripemd/ripemd.h b/main/openssl/crypto/ripemd/ripemd.h
index 5942eb61..189bd8c9 100644
--- a/main/openssl/crypto/ripemd/ripemd.h
+++ b/main/openssl/crypto/ripemd/ripemd.h
@@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st
unsigned int num;
} RIPEMD160_CTX;
+#ifdef OPENSSL_FIPS
+int private_RIPEMD160_Init(RIPEMD160_CTX *c);
+#endif
int RIPEMD160_Init(RIPEMD160_CTX *c);
int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);
diff --git a/main/openssl/crypto/ripemd/rmd_dgst.c b/main/openssl/crypto/ripemd/rmd_dgst.c
index 59b017f8..d8e72da5 100644
--- a/main/openssl/crypto/ripemd/rmd_dgst.c
+++ b/main/openssl/crypto/ripemd/rmd_dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include "rmd_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
@@ -69,7 +70,7 @@ const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num);
# endif
-int RIPEMD160_Init(RIPEMD160_CTX *c)
+fips_md_init(RIPEMD160)
{
memset (c,0,sizeof(*c));
c->A=RIPEMD160_A;
@@ -104,21 +105,21 @@ void ripemd160_block_data_order (RIPEMD160_CTX *ctx, const void *p, size_t num)
A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E;
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
- RIP1(A,B,C,D,E,WL00,SL00); HOST_c2l(data,l); X( 2)=l;
- RIP1(E,A,B,C,D,WL01,SL01); HOST_c2l(data,l); X( 3)=l;
- RIP1(D,E,A,B,C,WL02,SL02); HOST_c2l(data,l); X( 4)=l;
- RIP1(C,D,E,A,B,WL03,SL03); HOST_c2l(data,l); X( 5)=l;
- RIP1(B,C,D,E,A,WL04,SL04); HOST_c2l(data,l); X( 6)=l;
- RIP1(A,B,C,D,E,WL05,SL05); HOST_c2l(data,l); X( 7)=l;
- RIP1(E,A,B,C,D,WL06,SL06); HOST_c2l(data,l); X( 8)=l;
- RIP1(D,E,A,B,C,WL07,SL07); HOST_c2l(data,l); X( 9)=l;
- RIP1(C,D,E,A,B,WL08,SL08); HOST_c2l(data,l); X(10)=l;
- RIP1(B,C,D,E,A,WL09,SL09); HOST_c2l(data,l); X(11)=l;
- RIP1(A,B,C,D,E,WL10,SL10); HOST_c2l(data,l); X(12)=l;
- RIP1(E,A,B,C,D,WL11,SL11); HOST_c2l(data,l); X(13)=l;
- RIP1(D,E,A,B,C,WL12,SL12); HOST_c2l(data,l); X(14)=l;
- RIP1(C,D,E,A,B,WL13,SL13); HOST_c2l(data,l); X(15)=l;
+ (void)HOST_c2l(data,l); X( 0)=l;(void)HOST_c2l(data,l); X( 1)=l;
+ RIP1(A,B,C,D,E,WL00,SL00); (void)HOST_c2l(data,l); X( 2)=l;
+ RIP1(E,A,B,C,D,WL01,SL01); (void)HOST_c2l(data,l); X( 3)=l;
+ RIP1(D,E,A,B,C,WL02,SL02); (void)HOST_c2l(data,l); X( 4)=l;
+ RIP1(C,D,E,A,B,WL03,SL03); (void)HOST_c2l(data,l); X( 5)=l;
+ RIP1(B,C,D,E,A,WL04,SL04); (void)HOST_c2l(data,l); X( 6)=l;
+ RIP1(A,B,C,D,E,WL05,SL05); (void)HOST_c2l(data,l); X( 7)=l;
+ RIP1(E,A,B,C,D,WL06,SL06); (void)HOST_c2l(data,l); X( 8)=l;
+ RIP1(D,E,A,B,C,WL07,SL07); (void)HOST_c2l(data,l); X( 9)=l;
+ RIP1(C,D,E,A,B,WL08,SL08); (void)HOST_c2l(data,l); X(10)=l;
+ RIP1(B,C,D,E,A,WL09,SL09); (void)HOST_c2l(data,l); X(11)=l;
+ RIP1(A,B,C,D,E,WL10,SL10); (void)HOST_c2l(data,l); X(12)=l;
+ RIP1(E,A,B,C,D,WL11,SL11); (void)HOST_c2l(data,l); X(13)=l;
+ RIP1(D,E,A,B,C,WL12,SL12); (void)HOST_c2l(data,l); X(14)=l;
+ RIP1(C,D,E,A,B,WL13,SL13); (void)HOST_c2l(data,l); X(15)=l;
RIP1(B,C,D,E,A,WL14,SL14);
RIP1(A,B,C,D,E,WL15,SL15);
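
The (void) casts added throughout the block loader look cosmetic but are presumably there to silence "value computed is not used" warnings: HOST_c2l/HOST_l2c are expression macros that both move the pointer and yield the decoded word. A hypothetical macro of the same shape shows why the cast is wanted when only the side effects matter:

    /* Hypothetical little-endian load in the HOST_c2l style: assigns
     * l, advances c, and evaluates to l -- hence the (void) when the
     * result is deliberately ignored. */
    #define GET_LE32(c, l)  ((l)  = ((unsigned long)(c)[0])       | \
                                    ((unsigned long)(c)[1] <<  8) | \
                                    ((unsigned long)(c)[2] << 16) | \
                                    ((unsigned long)(c)[3] << 24),  \
                             (c) += 4, (l))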
diff --git a/main/openssl/crypto/ripemd/rmd_locl.h b/main/openssl/crypto/ripemd/rmd_locl.h
index f14b346e..2bd8957d 100644
--- a/main/openssl/crypto/ripemd/rmd_locl.h
+++ b/main/openssl/crypto/ripemd/rmd_locl.h
@@ -88,11 +88,11 @@ void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num);
#define HASH_FINAL RIPEMD160_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
- ll=(c)->E; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->E; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER ripemd160_block_data_order
diff --git a/main/openssl/crypto/rsa/rsa.h b/main/openssl/crypto/rsa/rsa.h
index cf743436..5f269e57 100644
--- a/main/openssl/crypto/rsa/rsa.h
+++ b/main/openssl/crypto/rsa/rsa.h
@@ -222,12 +222,22 @@ struct rsa_st
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \
pad, NULL)
+#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \
+ EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad)
+
#define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \
len, NULL)
+#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+ (EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
+ EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \
+ 0, plen)
+
#define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL)
@@ -236,11 +246,24 @@ struct rsa_st
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
+#define EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+
+#define EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+
#define EVP_PKEY_CTRL_RSA_PADDING (EVP_PKEY_ALG_CTRL + 1)
#define EVP_PKEY_CTRL_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 2)
#define EVP_PKEY_CTRL_RSA_KEYGEN_BITS (EVP_PKEY_ALG_CTRL + 3)
#define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP (EVP_PKEY_ALG_CTRL + 4)
+#define EVP_PKEY_CTRL_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 5)
+
+#define EVP_PKEY_CTRL_GET_RSA_PADDING (EVP_PKEY_ALG_CTRL + 6)
+#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 7)
+#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 8)
#define RSA_PKCS1_PADDING 1
#define RSA_SSLV23_PADDING 2
@@ -257,7 +280,7 @@ struct rsa_st
RSA * RSA_new(void);
RSA * RSA_new_method(ENGINE *engine);
-int RSA_size(const RSA *);
+int RSA_size(const RSA *rsa);
/* Deprecated version */
#ifndef OPENSSL_NO_DEPRECATED
@@ -300,6 +323,16 @@ const RSA_METHOD *RSA_null_method(void);
DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey)
DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey)
+typedef struct rsa_pss_params_st
+ {
+ X509_ALGOR *hashAlgorithm;
+ X509_ALGOR *maskGenAlgorithm;
+ ASN1_INTEGER *saltLength;
+ ASN1_INTEGER *trailerField;
+ } RSA_PSS_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
#ifndef OPENSSL_NO_FP_API
int RSA_print_fp(FILE *fp, const RSA *r,int offset);
#endif
@@ -380,6 +413,14 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
const unsigned char *mHash,
const EVP_MD *Hash, int sLen);
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+ const unsigned char *EM, int sLen);
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+ const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen);
+
int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func,
CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func);
int RSA_set_ex_data(RSA *r,int idx,void *arg);
@@ -388,6 +429,25 @@ void *RSA_get_ex_data(const RSA *r, int idx);
RSA *RSAPublicKey_dup(RSA *rsa);
RSA *RSAPrivateKey_dup(RSA *rsa);
+/* If this flag is set the RSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define RSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define RSA_FLAG_NON_FIPS_ALLOW 0x0400
+/* Application has decided PRNG is good enough to generate a key: don't
+ * check.
+ */
+#define RSA_FLAG_CHECKED 0x0800
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -405,6 +465,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_PKEY_RSA_CTRL 143
#define RSA_F_PKEY_RSA_CTRL_STR 144
#define RSA_F_PKEY_RSA_SIGN 142
+#define RSA_F_PKEY_RSA_VERIFY 154
#define RSA_F_PKEY_RSA_VERIFYRECOVER 141
#define RSA_F_RSA_BUILTIN_KEYGEN 129
#define RSA_F_RSA_CHECK_KEY 123
@@ -413,6 +474,8 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_EAY_PUBLIC_DECRYPT 103
#define RSA_F_RSA_EAY_PUBLIC_ENCRYPT 104
#define RSA_F_RSA_GENERATE_KEY 105
+#define RSA_F_RSA_GENERATE_KEY_EX 155
+#define RSA_F_RSA_ITEM_VERIFY 156
#define RSA_F_RSA_MEMORY_LOCK 130
#define RSA_F_RSA_NEW_METHOD 106
#define RSA_F_RSA_NULL 124
@@ -424,6 +487,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_PADDING_ADD_NONE 107
#define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP 121
#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS 125
+#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1 148
#define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1 108
#define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2 109
#define RSA_F_RSA_PADDING_ADD_SSLV23 110
@@ -436,8 +500,12 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_PADDING_CHECK_X931 128
#define RSA_F_RSA_PRINT 115
#define RSA_F_RSA_PRINT_FP 116
+#define RSA_F_RSA_PRIVATE_DECRYPT 150
+#define RSA_F_RSA_PRIVATE_ENCRYPT 151
#define RSA_F_RSA_PRIV_DECODE 137
#define RSA_F_RSA_PRIV_ENCODE 138
+#define RSA_F_RSA_PUBLIC_DECRYPT 152
+#define RSA_F_RSA_PUBLIC_ENCRYPT 153
#define RSA_F_RSA_PUB_DECODE 139
#define RSA_F_RSA_SETUP_BLINDING 136
#define RSA_F_RSA_SIGN 117
@@ -445,6 +513,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_VERIFY 119
#define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING 120
#define RSA_F_RSA_VERIFY_PKCS1_PSS 126
+#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1 149
/* Reason codes. */
#define RSA_R_ALGORITHM_MISMATCH 100
@@ -470,19 +539,24 @@ void ERR_load_RSA_strings(void);
#define RSA_R_INVALID_HEADER 137
#define RSA_R_INVALID_KEYBITS 145
#define RSA_R_INVALID_MESSAGE_LENGTH 131
+#define RSA_R_INVALID_MGF1_MD 156
#define RSA_R_INVALID_PADDING 138
#define RSA_R_INVALID_PADDING_MODE 141
+#define RSA_R_INVALID_PSS_PARAMETERS 149
#define RSA_R_INVALID_PSS_SALTLEN 146
+#define RSA_R_INVALID_SALT_LENGTH 150
#define RSA_R_INVALID_TRAILER 139
#define RSA_R_INVALID_X931_DIGEST 142
#define RSA_R_IQMP_NOT_INVERSE_OF_Q 126
#define RSA_R_KEY_SIZE_TOO_SMALL 120
#define RSA_R_LAST_OCTET_INVALID 134
#define RSA_R_MODULUS_TOO_LARGE 105
+#define RSA_R_NON_FIPS_RSA_METHOD 157
#define RSA_R_NO_PUBLIC_EXPONENT 140
#define RSA_R_NULL_BEFORE_BLOCK_MISSING 113
#define RSA_R_N_DOES_NOT_EQUAL_P_Q 127
#define RSA_R_OAEP_DECODING_ERROR 121
+#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE 158
#define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE 148
#define RSA_R_PADDING_CHECK_FAILED 114
#define RSA_R_P_NOT_PRIME 128
@@ -493,7 +567,12 @@ void ERR_load_RSA_strings(void);
#define RSA_R_SSLV3_ROLLBACK_ATTACK 115
#define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
#define RSA_R_UNKNOWN_ALGORITHM_TYPE 117
+#define RSA_R_UNKNOWN_MASK_DIGEST 151
#define RSA_R_UNKNOWN_PADDING_TYPE 118
+#define RSA_R_UNKNOWN_PSS_DIGEST 152
+#define RSA_R_UNSUPPORTED_MASK_ALGORITHM 153
+#define RSA_R_UNSUPPORTED_MASK_PARAMETER 154
+#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE 155
#define RSA_R_VALUE_MISSING 147
#define RSA_R_WRONG_SIGNATURE_LENGTH 119
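
The new set/get ctrl macros expose RSA-PSS configuration at the EVP_PKEY_CTX level; the paired ..._get_... variants are what rsa_item_sign() below uses to read the context back when encoding parameters. A hedged sketch of configuring a PSS signing context with the macros added above (error handling trimmed; pkey is assumed to hold an RSA key):

    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    int pss_digest_sign_init(EVP_MD_CTX *mctx, EVP_PKEY *pkey)
    {
            EVP_PKEY_CTX *pctx = NULL;

            if (!EVP_DigestSignInit(mctx, &pctx, EVP_sha256(), NULL, pkey))
                    return 0;
            if (EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) <= 0)
                    return 0;
            /* -1 selects a salt as long as the digest, cf. rsa_item_sign() */
            if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx, -1) <= 0)
                    return 0;
            if (EVP_PKEY_CTX_set_rsa_mgf1_md(pctx, EVP_sha256()) <= 0)
                    return 0;
            return 1;
    }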
diff --git a/main/openssl/crypto/rsa/rsa_ameth.c b/main/openssl/crypto/rsa/rsa_ameth.c
index 8c320988..5a2062f9 100644
--- a/main/openssl/crypto/rsa/rsa_ameth.c
+++ b/main/openssl/crypto/rsa/rsa_ameth.c
@@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
return do_rsa_print(bp, pkey->pkey.rsa, indent, 1);
}
+static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
+ X509_ALGOR **pmaskHash)
+ {
+ const unsigned char *p;
+ int plen;
+ RSA_PSS_PARAMS *pss;
+
+ *pmaskHash = NULL;
+
+ if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE)
+ return NULL;
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen);
+
+ if (!pss)
+ return NULL;
+
+ if (pss->maskGenAlgorithm)
+ {
+ ASN1_TYPE *param = pss->maskGenAlgorithm->parameter;
+ if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1
+ && param->type == V_ASN1_SEQUENCE)
+ {
+ p = param->value.sequence->data;
+ plen = param->value.sequence->length;
+ *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen);
+ }
+ }
+
+ return pss;
+ }
+
+static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss,
+ X509_ALGOR *maskHash, int indent)
+ {
+ int rv = 0;
+ if (!pss)
+ {
+ if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0)
+ return 0;
+ return 1;
+ }
+ if (BIO_puts(bp, "\n") <= 0)
+ goto err;
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Hash Algorithm: ") <= 0)
+ goto err;
+
+ if (pss->hashAlgorithm)
+ {
+ if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "sha1 (default)") <= 0)
+ goto err;
+
+ if (BIO_puts(bp, "\n") <= 0)
+ goto err;
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+
+ if (BIO_puts(bp, "Mask Algorithm: ") <= 0)
+ goto err;
+ if (pss->maskGenAlgorithm)
+ {
+ if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0)
+ goto err;
+ if (BIO_puts(bp, " with ") <= 0)
+ goto err;
+ if (maskHash)
+ {
+ if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "INVALID") <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Salt Length: 0x") <= 0)
+ goto err;
+ if (pss->saltLength)
+ {
+ if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "0x14 (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Trailer Field: 0x") <= 0)
+ goto err;
+ if (pss->trailerField)
+ {
+ if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "BC (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ rv = 1;
+
+ err:
+ return rv;
+
+ }
+
+static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss)
+ {
+ int rv;
+ RSA_PSS_PARAMS *pss;
+ X509_ALGOR *maskHash;
+ pss = rsa_pss_decode(sigalg, &maskHash);
+ rv = rsa_pss_param_print(bp, pss, maskHash, indent);
+ if (pss)
+ RSA_PSS_PARAMS_free(pss);
+ if (maskHash)
+ X509_ALGOR_free(maskHash);
+ if (!rv)
+ return 0;
+ }
+ else if (!sig && BIO_puts(bp, "\n") <= 0)
+ return 0;
+ if (sig)
+ return X509_signature_dump(bp, sig, indent);
+ return 1;
+ }
static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
@@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
}
+/* Customised RSA item verification routine. This is called
+ * when a signature is encountered requiring special handling. We
+ * currently only handle PSS.
+ */
+
+
+static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey)
+ {
+ int rv = -1;
+ int saltlen;
+ const EVP_MD *mgf1md = NULL, *md = NULL;
+ RSA_PSS_PARAMS *pss;
+ X509_ALGOR *maskHash;
+ EVP_PKEY_CTX *pkctx;
+ /* Sanity check: make sure it is PSS */
+ if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+ return -1;
+ }
+ /* Decode PSS parameters */
+ pss = rsa_pss_decode(sigalg, &maskHash);
+
+ if (pss == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS);
+ goto err;
+ }
+ /* Check mask and lookup mask hash algorithm */
+ if (pss->maskGenAlgorithm)
+ {
+ if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
+ goto err;
+ }
+ if (!maskHash)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER);
+ goto err;
+ }
+ mgf1md = EVP_get_digestbyobj(maskHash->algorithm);
+ if (mgf1md == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST);
+ goto err;
+ }
+ }
+ else
+ mgf1md = EVP_sha1();
+
+ if (pss->hashAlgorithm)
+ {
+ md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm);
+ if (md == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST);
+ goto err;
+ }
+ }
+ else
+ md = EVP_sha1();
+
+ if (pss->saltLength)
+ {
+ saltlen = ASN1_INTEGER_get(pss->saltLength);
+
+ /* Could perform more salt length sanity checks but the main
+ * RSA routines will trap other invalid values anyway.
+ */
+ if (saltlen < 0)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH);
+ goto err;
+ }
+ }
+ else
+ saltlen = 20;
+
+ /* low-level routines support only trailer field 0xbc (value 1)
+ * and PKCS#1 says we should reject any other value anyway.
+ */
+ if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER);
+ goto err;
+ }
+
+ /* We have all parameters now set up context */
+
+ if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
+ goto err;
+ /* Carry on */
+ rv = 2;
+
+ err:
+ RSA_PSS_PARAMS_free(pss);
+ if (maskHash)
+ X509_ALGOR_free(maskHash);
+ return rv;
+ }
+
+static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig)
+ {
+ int pad_mode;
+ EVP_PKEY_CTX *pkctx = ctx->pctx;
+ if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+ return 0;
+ if (pad_mode == RSA_PKCS1_PADDING)
+ return 2;
+ if (pad_mode == RSA_PKCS1_PSS_PADDING)
+ {
+ const EVP_MD *sigmd, *mgf1md;
+ RSA_PSS_PARAMS *pss = NULL;
+ X509_ALGOR *mgf1alg = NULL;
+ ASN1_STRING *os1 = NULL, *os2 = NULL;
+ EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
+ int saltlen, rv = 0;
+ sigmd = EVP_MD_CTX_md(ctx);
+ if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+ goto err;
+ if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
+ goto err;
+ if (saltlen == -1)
+ saltlen = EVP_MD_size(sigmd);
+ else if (saltlen == -2)
+ {
+ saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
+ if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
+ saltlen--;
+ }
+ pss = RSA_PSS_PARAMS_new();
+ if (!pss)
+ goto err;
+ if (saltlen != 20)
+ {
+ pss->saltLength = ASN1_INTEGER_new();
+ if (!pss->saltLength)
+ goto err;
+ if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
+ goto err;
+ }
+ if (EVP_MD_type(sigmd) != NID_sha1)
+ {
+ pss->hashAlgorithm = X509_ALGOR_new();
+ if (!pss->hashAlgorithm)
+ goto err;
+ X509_ALGOR_set_md(pss->hashAlgorithm, sigmd);
+ }
+ if (EVP_MD_type(mgf1md) != NID_sha1)
+ {
+ ASN1_STRING *stmp = NULL;
+ /* need to embed algorithm ID inside another */
+ mgf1alg = X509_ALGOR_new();
+ X509_ALGOR_set_md(mgf1alg, mgf1md);
+ if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR),
+ &stmp))
+ goto err;
+ pss->maskGenAlgorithm = X509_ALGOR_new();
+ if (!pss->maskGenAlgorithm)
+ goto err;
+ X509_ALGOR_set0(pss->maskGenAlgorithm,
+ OBJ_nid2obj(NID_mgf1),
+ V_ASN1_SEQUENCE, stmp);
+ }
+ /* Finally create string with pss parameter encoding. */
+ if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1))
+ goto err;
+ if (alg2)
+ {
+ os2 = ASN1_STRING_dup(os1);
+ if (!os2)
+ goto err;
+ X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss),
+ V_ASN1_SEQUENCE, os2);
+ }
+ X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss),
+ V_ASN1_SEQUENCE, os1);
+ os1 = os2 = NULL;
+ rv = 3;
+ err:
+ if (mgf1alg)
+ X509_ALGOR_free(mgf1alg);
+ if (pss)
+ RSA_PSS_PARAMS_free(pss);
+ if (os1)
+ ASN1_STRING_free(os1);
+ return rv;
+
+ }
+ return 2;
+ }
const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] =
{
@@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] =
0,0,0,0,0,0,
+ rsa_sig_print,
int_rsa_free,
rsa_pkey_ctrl,
old_rsa_priv_decode,
- old_rsa_priv_encode
+ old_rsa_priv_encode,
+ rsa_item_verify,
+ rsa_item_sign
},
{
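
The division of labour between these handlers and the generic ASN.1 signing code is driven by their return values, spelled out by the comment in crypto/asn1/a_sign.c:

    /* Return-code protocol with ASN1_item_sign_ctx()/item_verify:
     *   <= 0  error
     *      1  method did everything itself
     *      2  carry on with the default signing/verification flow
     *      3  (sign only) algorithm identifiers already set, just sign
     * Hence rsa_item_verify() returns 2 after loading the decoded PSS
     * parameters into the EVP_PKEY_CTX, and rsa_item_sign() returns 3
     * once it has serialised them into alg1/alg2 (or 2 for plain
     * PKCS#1 padding, where the default encoding suffices). */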
diff --git a/main/openssl/crypto/rsa/rsa_asn1.c b/main/openssl/crypto/rsa/rsa_asn1.c
index 4efca8cd..6ed5de3d 100644
--- a/main/openssl/crypto/rsa/rsa_asn1.c
+++ b/main/openssl/crypto/rsa/rsa_asn1.c
@@ -60,6 +60,7 @@
#include "cryptlib.h"
#include <openssl/bn.h>
#include <openssl/rsa.h>
+#include <openssl/x509.h>
#include <openssl/asn1t.h>
/* Override the default free and new methods */
@@ -96,6 +97,15 @@ ASN1_SEQUENCE_cb(RSAPublicKey, rsa_cb) = {
ASN1_SIMPLE(RSA, e, BIGNUM),
} ASN1_SEQUENCE_END_cb(RSA, RSAPublicKey)
+ASN1_SEQUENCE(RSA_PSS_PARAMS) = {
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR,0),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR,1),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER,2),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER,3)
+} ASN1_SEQUENCE_END(RSA_PSS_PARAMS)
+
+IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey)
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey)
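
The ASN1_SEQUENCE template above gives RSA_PSS_PARAMS the four optional EXPLICIT fields of RSASSA-PSS-params, and IMPLEMENT_ASN1_FUNCTIONS generates the usual new/free/d2i/i2d set. A hedged sketch of decoding the DER parameters carried in an AlgorithmIdentifier, mirroring rsa_pss_decode():

    #include <openssl/rsa.h>
    #include <openssl/x509.h>

    /* Illustrative only: absent fields mean the RFC defaults
     * (sha1, mgf1-with-sha1, saltLength 20, trailerField 1). */
    RSA_PSS_PARAMS *decode_pss(const unsigned char *der, long derlen)
    {
            const unsigned char *p = der;   /* d2i advances the pointer */
            return d2i_RSA_PSS_PARAMS(NULL, &p, derlen);
    }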
diff --git a/main/openssl/crypto/rsa/rsa_chk.c b/main/openssl/crypto/rsa/rsa_chk.c
index 9d848db8..cc30e771 100644
--- a/main/openssl/crypto/rsa/rsa_chk.c
+++ b/main/openssl/crypto/rsa/rsa_chk.c
@@ -59,6 +59,12 @@ int RSA_check_key(const RSA *key)
BN_CTX *ctx;
int r;
int ret=1;
+
+ if (!key->p || !key->q || !key->n || !key->e || !key->d)
+ {
+ RSAerr(RSA_F_RSA_CHECK_KEY, RSA_R_VALUE_MISSING);
+ return 0;
+ }
i = BN_new();
j = BN_new();
diff --git a/main/openssl/crypto/rsa/rsa_crpt.c b/main/openssl/crypto/rsa/rsa_crpt.c
new file mode 100644
index 00000000..d3e44785
--- /dev/null
+++ b/main/openssl/crypto/rsa/rsa_crpt.c
@@ -0,0 +1,257 @@
+/* crypto/rsa/rsa_lib.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/lhash.h>
+#include <openssl/bn.h>
+#include <openssl/rsa.h>
+#include <openssl/rand.h>
+#ifndef OPENSSL_NO_ENGINE
+#include <openssl/engine.h>
+#endif
+
+int RSA_size(const RSA *r)
+ {
+ return(BN_num_bytes(r->n));
+ }
+
+int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
+ }
+
+int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
+ }
+
+int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
+ }
+
+int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
+ }
+
+int RSA_flags(const RSA *r)
+ {
+ return((r == NULL)?0:r->meth->flags);
+ }
+
+void RSA_blinding_off(RSA *rsa)
+ {
+ if (rsa->blinding != NULL)
+ {
+ BN_BLINDING_free(rsa->blinding);
+ rsa->blinding=NULL;
+ }
+ rsa->flags &= ~RSA_FLAG_BLINDING;
+ rsa->flags |= RSA_FLAG_NO_BLINDING;
+ }
+
+int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
+ {
+ int ret=0;
+
+ if (rsa->blinding != NULL)
+ RSA_blinding_off(rsa);
+
+ rsa->blinding = RSA_setup_blinding(rsa, ctx);
+ if (rsa->blinding == NULL)
+ goto err;
+
+ rsa->flags |= RSA_FLAG_BLINDING;
+ rsa->flags &= ~RSA_FLAG_NO_BLINDING;
+ ret=1;
+err:
+ return(ret);
+ }
+
+static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
+ const BIGNUM *q, BN_CTX *ctx)
+{
+ BIGNUM *ret = NULL, *r0, *r1, *r2;
+
+ if (d == NULL || p == NULL || q == NULL)
+ return NULL;
+
+ BN_CTX_start(ctx);
+ r0 = BN_CTX_get(ctx);
+ r1 = BN_CTX_get(ctx);
+ r2 = BN_CTX_get(ctx);
+ if (r2 == NULL)
+ goto err;
+
+ if (!BN_sub(r1, p, BN_value_one())) goto err;
+ if (!BN_sub(r2, q, BN_value_one())) goto err;
+ if (!BN_mul(r0, r1, r2, ctx)) goto err;
+
+ ret = BN_mod_inverse(NULL, d, r0, ctx);
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
+{
+ BIGNUM local_n;
+ BIGNUM *e,*n;
+ BN_CTX *ctx;
+ BN_BLINDING *ret = NULL;
+
+ if (in_ctx == NULL)
+ {
+ if ((ctx = BN_CTX_new()) == NULL) return 0;
+ }
+ else
+ ctx = in_ctx;
+
+ BN_CTX_start(ctx);
+ e = BN_CTX_get(ctx);
+ if (e == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (rsa->e == NULL)
+ {
+ e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
+ if (e == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
+ goto err;
+ }
+ }
+ else
+ e = rsa->e;
+
+
+ if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
+ {
+ /* if PRNG is not properly seeded, resort to secret
+ * exponent as unpredictable seed */
+ RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
+ }
+
+ if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
+ {
+ /* Set BN_FLG_CONSTTIME flag */
+ n = &local_n;
+ BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
+ }
+ else
+ n = rsa->n;
+
+ ret = BN_BLINDING_create_param(NULL, e, n, ctx,
+ rsa->meth->bn_mod_exp, rsa->_method_mod_n);
+ if (ret == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
+ goto err;
+ }
+ CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
+err:
+ BN_CTX_end(ctx);
+ if (in_ctx == NULL)
+ BN_CTX_free(ctx);
+ if(rsa->e == NULL)
+ BN_free(e);
+
+ return ret;
+}
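
rsa_get_public_exp() in the new rsa_crpt.c recovers a usable public exponent for blinding when rsa->e is absent. For keys generated the way OpenSSL generates them, with d the inverse of e modulo (p-1)(q-1), the relation can simply be inverted; in LaTeX, a sketch of the identity the code computes:

    e \cdot d \equiv 1 \pmod{(p-1)(q-1)}
    \quad\Longrightarrow\quad
    e = d^{-1} \bmod (p-1)(q-1)

which is exactly BN_mod_inverse(NULL, d, r0, ctx) with r0 = (p-1)(q-1) built from the two BN_sub and one BN_mul calls above.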
diff --git a/main/openssl/crypto/rsa/rsa_eay.c b/main/openssl/crypto/rsa/rsa_eay.c
index 7c941885..88ee2cb5 100644
--- a/main/openssl/crypto/rsa/rsa_eay.c
+++ b/main/openssl/crypto/rsa/rsa_eay.c
@@ -314,51 +314,56 @@ static BN_BLINDING *rsa_get_blinding(RSA *rsa, int *local, BN_CTX *ctx)
return ret;
}
-static int rsa_blinding_convert(BN_BLINDING *b, int local, BIGNUM *f,
- BIGNUM *r, BN_CTX *ctx)
-{
- if (local)
+static int rsa_blinding_convert(BN_BLINDING *b, BIGNUM *f, BIGNUM *unblind,
+ BN_CTX *ctx)
+ {
+ if (unblind == NULL)
+ /* Local blinding: store the unblinding factor
+ * in BN_BLINDING. */
return BN_BLINDING_convert_ex(f, NULL, b, ctx);
else
{
- int ret;
- CRYPTO_r_lock(CRYPTO_LOCK_RSA_BLINDING);
- ret = BN_BLINDING_convert_ex(f, r, b, ctx);
- CRYPTO_r_unlock(CRYPTO_LOCK_RSA_BLINDING);
- return ret;
- }
-}
-
-static int rsa_blinding_invert(BN_BLINDING *b, int local, BIGNUM *f,
- BIGNUM *r, BN_CTX *ctx)
-{
- if (local)
- return BN_BLINDING_invert_ex(f, NULL, b, ctx);
- else
- {
+ /* Shared blinding: store the unblinding factor
+ * outside BN_BLINDING. */
int ret;
CRYPTO_w_lock(CRYPTO_LOCK_RSA_BLINDING);
- ret = BN_BLINDING_invert_ex(f, r, b, ctx);
+ ret = BN_BLINDING_convert_ex(f, unblind, b, ctx);
CRYPTO_w_unlock(CRYPTO_LOCK_RSA_BLINDING);
return ret;
}
-}
+ }
+
+static int rsa_blinding_invert(BN_BLINDING *b, BIGNUM *f, BIGNUM *unblind,
+ BN_CTX *ctx)
+ {
+ /* For local blinding, unblind is set to NULL, and BN_BLINDING_invert_ex
+ * will use the unblinding factor stored in BN_BLINDING.
+ * If BN_BLINDING is shared between threads, unblind must be non-null:
+ * BN_BLINDING_invert_ex will then use the local unblinding factor,
+ * and will only read the modulus from BN_BLINDING.
+ * In both cases it's safe to access the blinding without a lock.
+ */
+ return BN_BLINDING_invert_ex(f, unblind, b, ctx);
+ }
/* signing */
static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
unsigned char *to, RSA *rsa, int padding)
{
- BIGNUM *f, *ret, *br, *res;
+ BIGNUM *f, *ret, *res;
int i,j,k,num=0,r= -1;
unsigned char *buf=NULL;
BN_CTX *ctx=NULL;
int local_blinding = 0;
+ /* Used only if the blinding structure is shared. A non-NULL unblind
+ * instructs rsa_blinding_convert() and rsa_blinding_invert() to store
+ * the unblinding factor outside the blinding structure. */
+ BIGNUM *unblind = NULL;
BN_BLINDING *blinding = NULL;
if ((ctx=BN_CTX_new()) == NULL) goto err;
BN_CTX_start(ctx);
f = BN_CTX_get(ctx);
- br = BN_CTX_get(ctx);
ret = BN_CTX_get(ctx);
num = BN_num_bytes(rsa->n);
buf = OPENSSL_malloc(num);
@@ -406,8 +411,15 @@ static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
}
if (blinding != NULL)
- if (!rsa_blinding_convert(blinding, local_blinding, f, br, ctx))
+ {
+ if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
+ {
+ RSAerr(RSA_F_RSA_EAY_PRIVATE_ENCRYPT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!rsa_blinding_convert(blinding, f, unblind, ctx))
goto err;
+ }
if ( (rsa->flags & RSA_FLAG_EXT_PKEY) ||
((rsa->p != NULL) &&
@@ -441,7 +453,7 @@ static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
}
if (blinding)
- if (!rsa_blinding_invert(blinding, local_blinding, ret, br, ctx))
+ if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
goto err;
if (padding == RSA_X931_PADDING)
@@ -480,18 +492,21 @@ err:
static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
unsigned char *to, RSA *rsa, int padding)
{
- BIGNUM *f, *ret, *br;
+ BIGNUM *f, *ret;
int j,num=0,r= -1;
unsigned char *p;
unsigned char *buf=NULL;
BN_CTX *ctx=NULL;
int local_blinding = 0;
+ /* Used only if the blinding structure is shared. A non-NULL unblind
+ * instructs rsa_blinding_convert() and rsa_blinding_invert() to store
+ * the unblinding factor outside the blinding structure. */
+ BIGNUM *unblind = NULL;
BN_BLINDING *blinding = NULL;
if((ctx = BN_CTX_new()) == NULL) goto err;
BN_CTX_start(ctx);
f = BN_CTX_get(ctx);
- br = BN_CTX_get(ctx);
ret = BN_CTX_get(ctx);
num = BN_num_bytes(rsa->n);
buf = OPENSSL_malloc(num);
@@ -529,8 +544,15 @@ static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
}
if (blinding != NULL)
- if (!rsa_blinding_convert(blinding, local_blinding, f, br, ctx))
+ {
+ if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
+ {
+ RSAerr(RSA_F_RSA_EAY_PRIVATE_DECRYPT,ERR_R_MALLOC_FAILURE);
goto err;
+ }
+ if (!rsa_blinding_convert(blinding, f, unblind, ctx))
+ goto err;
+ }
/* do the decrypt */
if ( (rsa->flags & RSA_FLAG_EXT_PKEY) ||
@@ -564,7 +586,7 @@ static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
}
if (blinding)
- if (!rsa_blinding_invert(blinding, local_blinding, ret, br, ctx))
+ if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
goto err;
p=buf;
@@ -825,12 +847,12 @@ static int RSA_eay_mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx)
if (!BN_mod(r0,pr1,rsa->p,ctx)) goto err;
/* If p < q it is occasionally possible for the correction of
- * adding 'p' if r0 is negative above to leave the result still
+ * adding 'p' if r0 is negative above to leave the result still
* negative. This can break the private key operations: the following
* second correction should *always* correct this rare occurrence.
* This will *never* happen with OpenSSL generated keys because
- * they ensure p > q [steve]
- */
+ * they ensure p > q [steve]
+ */
if (BN_is_negative(r0))
if (!BN_add(r0,r0,rsa->p)) goto err;
if (!BN_mul(r1,r0,rsa->q,ctx)) goto err;
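Net effect of the rsa_eay.c hunks: the CRYPTO_LOCK_RSA_BLINDING lock is now taken only around the convert step, and a shared blinding keeps its per-call unblinding factor in the caller's BN_CTX rather than inside BN_BLINDING, so the invert step needs no lock at all. The call pattern, condensed from the hunks above with error paths trimmed:

    if (blinding != NULL)
        {
        /* shared blinding: allocate a per-call unblinding factor */
        if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
            goto err;
        /* the write lock is held only inside rsa_blinding_convert() */
        if (!rsa_blinding_convert(blinding, f, unblind, ctx))
            goto err;
        }
    /* ... private-key operation on the blinded input f, result in ret ... */
    if (blinding)
        /* lock-free: only the modulus is read from the shared BN_BLINDING */
        if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
            goto err;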
diff --git a/main/openssl/crypto/rsa/rsa_err.c b/main/openssl/crypto/rsa/rsa_err.c
index cf9f1106..46e0bf99 100644
--- a/main/openssl/crypto/rsa/rsa_err.c
+++ b/main/openssl/crypto/rsa/rsa_err.c
@@ -1,6 +1,6 @@
/* crypto/rsa/rsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -78,6 +78,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_PKEY_RSA_CTRL), "PKEY_RSA_CTRL"},
{ERR_FUNC(RSA_F_PKEY_RSA_CTRL_STR), "PKEY_RSA_CTRL_STR"},
{ERR_FUNC(RSA_F_PKEY_RSA_SIGN), "PKEY_RSA_SIGN"},
+{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY), "PKEY_RSA_VERIFY"},
{ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER), "PKEY_RSA_VERIFYRECOVER"},
{ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN), "RSA_BUILTIN_KEYGEN"},
{ERR_FUNC(RSA_F_RSA_CHECK_KEY), "RSA_check_key"},
@@ -86,6 +87,8 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT), "RSA_EAY_PUBLIC_DECRYPT"},
{ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_ENCRYPT), "RSA_EAY_PUBLIC_ENCRYPT"},
{ERR_FUNC(RSA_F_RSA_GENERATE_KEY), "RSA_generate_key"},
+{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX), "RSA_generate_key_ex"},
+{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY), "RSA_ITEM_VERIFY"},
{ERR_FUNC(RSA_F_RSA_MEMORY_LOCK), "RSA_memory_lock"},
{ERR_FUNC(RSA_F_RSA_NEW_METHOD), "RSA_new_method"},
{ERR_FUNC(RSA_F_RSA_NULL), "RSA_NULL"},
@@ -97,6 +100,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE), "RSA_padding_add_none"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP), "RSA_padding_add_PKCS1_OAEP"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS), "RSA_padding_add_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1), "RSA_padding_add_PKCS1_PSS_mgf1"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1), "RSA_padding_add_PKCS1_type_1"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2), "RSA_padding_add_PKCS1_type_2"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_SSLV23), "RSA_padding_add_SSLv23"},
@@ -109,8 +113,12 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_X931), "RSA_padding_check_X931"},
{ERR_FUNC(RSA_F_RSA_PRINT), "RSA_print"},
{ERR_FUNC(RSA_F_RSA_PRINT_FP), "RSA_print_fp"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_DECRYPT), "RSA_private_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT), "RSA_private_encrypt"},
{ERR_FUNC(RSA_F_RSA_PRIV_DECODE), "RSA_PRIV_DECODE"},
{ERR_FUNC(RSA_F_RSA_PRIV_ENCODE), "RSA_PRIV_ENCODE"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT), "RSA_public_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT), "RSA_public_encrypt"},
{ERR_FUNC(RSA_F_RSA_PUB_DECODE), "RSA_PUB_DECODE"},
{ERR_FUNC(RSA_F_RSA_SETUP_BLINDING), "RSA_setup_blinding"},
{ERR_FUNC(RSA_F_RSA_SIGN), "RSA_sign"},
@@ -118,6 +126,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_VERIFY), "RSA_verify"},
{ERR_FUNC(RSA_F_RSA_VERIFY_ASN1_OCTET_STRING), "RSA_verify_ASN1_OCTET_STRING"},
{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS), "RSA_verify_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1), "RSA_verify_PKCS1_PSS_mgf1"},
{0,NULL}
};
@@ -146,19 +155,24 @@ static ERR_STRING_DATA RSA_str_reasons[]=
{ERR_REASON(RSA_R_INVALID_HEADER) ,"invalid header"},
{ERR_REASON(RSA_R_INVALID_KEYBITS) ,"invalid keybits"},
{ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH),"invalid message length"},
+{ERR_REASON(RSA_R_INVALID_MGF1_MD) ,"invalid mgf1 md"},
{ERR_REASON(RSA_R_INVALID_PADDING) ,"invalid padding"},
{ERR_REASON(RSA_R_INVALID_PADDING_MODE) ,"invalid padding mode"},
+{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS),"invalid pss parameters"},
{ERR_REASON(RSA_R_INVALID_PSS_SALTLEN) ,"invalid pss saltlen"},
+{ERR_REASON(RSA_R_INVALID_SALT_LENGTH) ,"invalid salt length"},
{ERR_REASON(RSA_R_INVALID_TRAILER) ,"invalid trailer"},
{ERR_REASON(RSA_R_INVALID_X931_DIGEST) ,"invalid x931 digest"},
{ERR_REASON(RSA_R_IQMP_NOT_INVERSE_OF_Q) ,"iqmp not inverse of q"},
{ERR_REASON(RSA_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(RSA_R_LAST_OCTET_INVALID) ,"last octet invalid"},
{ERR_REASON(RSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(RSA_R_NON_FIPS_RSA_METHOD) ,"non fips rsa method"},
{ERR_REASON(RSA_R_NO_PUBLIC_EXPONENT) ,"no public exponent"},
{ERR_REASON(RSA_R_NULL_BEFORE_BLOCK_MISSING),"null before block missing"},
{ERR_REASON(RSA_R_N_DOES_NOT_EQUAL_P_Q) ,"n does not equal p q"},
{ERR_REASON(RSA_R_OAEP_DECODING_ERROR) ,"oaep decoding error"},
+{ERR_REASON(RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE),"operation not allowed in fips mode"},
{ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
{ERR_REASON(RSA_R_PADDING_CHECK_FAILED) ,"padding check failed"},
{ERR_REASON(RSA_R_P_NOT_PRIME) ,"p not prime"},
@@ -169,7 +183,12 @@ static ERR_STRING_DATA RSA_str_reasons[]=
{ERR_REASON(RSA_R_SSLV3_ROLLBACK_ATTACK) ,"sslv3 rollback attack"},
{ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),"the asn1 object identifier is not known for this md"},
{ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE),"unknown algorithm type"},
+{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST) ,"unknown mask digest"},
{ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE) ,"unknown padding type"},
+{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST) ,"unknown pss digest"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),"unsupported mask algorithm"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),"unsupported mask parameter"},
+{ERR_REASON(RSA_R_UNSUPPORTED_SIGNATURE_TYPE),"unsupported signature type"},
{ERR_REASON(RSA_R_VALUE_MISSING) ,"value missing"},
{ERR_REASON(RSA_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
{0,NULL}
diff --git a/main/openssl/crypto/rsa/rsa_gen.c b/main/openssl/crypto/rsa/rsa_gen.c
index 767f7ab6..42290cce 100644
--- a/main/openssl/crypto/rsa/rsa_gen.c
+++ b/main/openssl/crypto/rsa/rsa_gen.c
@@ -67,6 +67,9 @@
#include "cryptlib.h"
#include <openssl/bn.h>
#include <openssl/rsa.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
@@ -77,8 +80,20 @@ static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
* now just because key-generation is part of RSA_METHOD. */
int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_GENERATE_KEY_EX, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
if(rsa->meth->rsa_keygen)
return rsa->meth->rsa_keygen(rsa, bits, e_value, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_rsa_generate_key_ex(rsa, bits, e_value, cb);
+#endif
return rsa_builtin_keygen(rsa, bits, e_value, cb);
}
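For context, the entry point being gated here is unchanged in shape; a hypothetical caller looks like this (name and error handling are illustrative only; in FIPS mode with a non-FIPS method the call now fails and queues RSA_R_NON_FIPS_RSA_METHOD):

    #include <openssl/rsa.h>
    #include <openssl/bn.h>

    RSA *generate_rsa_2048(void)
        {
        RSA *rsa = RSA_new();
        BIGNUM *e = BN_new();

        BN_set_word(e, RSA_F4);               /* e = 65537 */
        if (!RSA_generate_key_ex(rsa, 2048, e, NULL))
            {
            RSA_free(rsa);                    /* 0 return: see error queue */
            rsa = NULL;
            }
        BN_free(e);
        return rsa;
        }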
diff --git a/main/openssl/crypto/rsa/rsa_lib.c b/main/openssl/crypto/rsa/rsa_lib.c
index de45088d..c95ceafc 100644
--- a/main/openssl/crypto/rsa/rsa_lib.c
+++ b/main/openssl/crypto/rsa/rsa_lib.c
@@ -67,6 +67,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char RSA_version[]="RSA" OPENSSL_VERSION_PTEXT;
static const RSA_METHOD *default_RSA_meth=NULL;
@@ -87,12 +91,15 @@ const RSA_METHOD *RSA_get_default_method(void)
{
if (default_RSA_meth == NULL)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_rsa_pkcs1_ssleay();
+ else
+ return RSA_PKCS1_SSLeay();
+#else
#ifdef RSA_NULL
default_RSA_meth=RSA_null_method();
#else
-#if 0 /* was: #ifdef RSAref */
- default_RSA_meth=RSA_PKCS1_RSAref();
-#else
default_RSA_meth=RSA_PKCS1_SSLeay();
#endif
#endif
@@ -181,7 +188,7 @@ RSA *RSA_new_method(ENGINE *engine)
ret->blinding=NULL;
ret->mt_blinding=NULL;
ret->bignum_data=NULL;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~RSA_FLAG_NON_FIPS_ALLOW;
if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_RSA, ret, &ret->ex_data))
{
#ifndef OPENSSL_NO_ENGINE
@@ -280,163 +287,6 @@ void *RSA_get_ex_data(const RSA *r, int idx)
return(CRYPTO_get_ex_data(&r->ex_data,idx));
}
-int RSA_size(const RSA *r)
- {
- return(BN_num_bytes(r->n));
- }
-
-int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
- }
-
-int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
- }
-
-int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
- }
-
-int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
- }
-
-int RSA_flags(const RSA *r)
- {
- return((r == NULL)?0:r->meth->flags);
- }
-
-void RSA_blinding_off(RSA *rsa)
- {
- if (rsa->blinding != NULL)
- {
- BN_BLINDING_free(rsa->blinding);
- rsa->blinding=NULL;
- }
- rsa->flags &= ~RSA_FLAG_BLINDING;
- rsa->flags |= RSA_FLAG_NO_BLINDING;
- }
-
-int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
- {
- int ret=0;
-
- if (rsa->blinding != NULL)
- RSA_blinding_off(rsa);
-
- rsa->blinding = RSA_setup_blinding(rsa, ctx);
- if (rsa->blinding == NULL)
- goto err;
-
- rsa->flags |= RSA_FLAG_BLINDING;
- rsa->flags &= ~RSA_FLAG_NO_BLINDING;
- ret=1;
-err:
- return(ret);
- }
-
-static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
- const BIGNUM *q, BN_CTX *ctx)
-{
- BIGNUM *ret = NULL, *r0, *r1, *r2;
-
- if (d == NULL || p == NULL || q == NULL)
- return NULL;
-
- BN_CTX_start(ctx);
- r0 = BN_CTX_get(ctx);
- r1 = BN_CTX_get(ctx);
- r2 = BN_CTX_get(ctx);
- if (r2 == NULL)
- goto err;
-
- if (!BN_sub(r1, p, BN_value_one())) goto err;
- if (!BN_sub(r2, q, BN_value_one())) goto err;
- if (!BN_mul(r0, r1, r2, ctx)) goto err;
-
- ret = BN_mod_inverse(NULL, d, r0, ctx);
-err:
- BN_CTX_end(ctx);
- return ret;
-}
-
-BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
-{
- BIGNUM local_n;
- BIGNUM *e,*n;
- BN_CTX *ctx;
- BN_BLINDING *ret = NULL;
-
- if (in_ctx == NULL)
- {
- if ((ctx = BN_CTX_new()) == NULL) return 0;
- }
- else
- ctx = in_ctx;
-
- BN_CTX_start(ctx);
- e = BN_CTX_get(ctx);
- if (e == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
- goto err;
- }
-
- if (rsa->e == NULL)
- {
- e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
- if (e == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
- goto err;
- }
- }
- else
- e = rsa->e;
-
-
- if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
- {
- /* if PRNG is not properly seeded, resort to secret
- * exponent as unpredictable seed */
- RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
- }
-
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
- {
- /* Set BN_FLG_CONSTTIME flag */
- n = &local_n;
- BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
- }
- else
- n = rsa->n;
-
- ret = BN_BLINDING_create_param(NULL, e, n, ctx,
- rsa->meth->bn_mod_exp, rsa->_method_mod_n);
- if (ret == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
- goto err;
- }
- CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
-err:
- BN_CTX_end(ctx);
- if (in_ctx == NULL)
- BN_CTX_free(ctx);
- if(rsa->e == NULL)
- BN_free(e);
-
- return ret;
-}
-
int RSA_memory_lock(RSA *r)
{
int i,j,k,off;
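One subtle consequence of the RSA_new_method() change above: RSA_FLAG_NON_FIPS_ALLOW is masked out of the flags inherited from the method, so opting a key out of the FIPS restriction must now be done explicitly per RSA object. Illustrative snippet (the flag is real; the surrounding code is a hypothetical caller):

    RSA *rsa = RSA_new();                  /* inherits meth flags minus
                                              RSA_FLAG_NON_FIPS_ALLOW    */
    rsa->flags |= RSA_FLAG_NON_FIPS_ALLOW; /* per-key opt-out, no longer
                                              inherited from the method  */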
diff --git a/main/openssl/crypto/rsa/rsa_oaep.c b/main/openssl/crypto/rsa/rsa_oaep.c
index 18d307ea..af4d24a5 100644
--- a/main/openssl/crypto/rsa/rsa_oaep.c
+++ b/main/openssl/crypto/rsa/rsa_oaep.c
@@ -56,7 +56,8 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
seed = to + 1;
db = to + SHA_DIGEST_LENGTH + 1;
- EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL))
+ return 0;
memset(db + SHA_DIGEST_LENGTH, 0,
emlen - flen - 2 * SHA_DIGEST_LENGTH - 1);
db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01;
@@ -145,9 +146,10 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
for (i = 0; i < dblen; i++)
db[i] ^= maskeddb[i];
- EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL))
+ return -1;
- if (memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
+ if (CRYPTO_memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
goto decoding_err;
else
{
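The memcmp -> CRYPTO_memcmp swap matters because the OAEP hash comparison runs on attacker-influenced data: a compare that returns at the first mismatching byte leaks the match length through timing. CRYPTO_memcmp instead accumulates differences across the full length. A sketch of the idea (not the library's exact code):

    #include <stddef.h>

    /* Constant-time compare: every byte is touched, so running time
     * does not depend on where the first mismatch occurs. */
    static int ct_memcmp(const void *a, const void *b, size_t len)
        {
        const unsigned char *pa = a, *pb = b;
        unsigned char diff = 0;
        size_t i;

        for (i = 0; i < len; i++)
            diff |= pa[i] ^ pb[i];
        return diff;                /* 0 iff all len bytes are equal */
        }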
diff --git a/main/openssl/crypto/rsa/rsa_pmeth.c b/main/openssl/crypto/rsa/rsa_pmeth.c
index c6892ecd..157aa5c4 100644
--- a/main/openssl/crypto/rsa/rsa_pmeth.c
+++ b/main/openssl/crypto/rsa/rsa_pmeth.c
@@ -63,6 +63,12 @@
#include <openssl/rsa.h>
#include <openssl/bn.h>
#include <openssl/evp.h>
+#ifndef OPENSSL_NO_CMS
+#include <openssl/cms.h>
+#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include "evp_locl.h"
#include "rsa_locl.h"
@@ -79,6 +85,8 @@ typedef struct
int pad_mode;
/* message digest */
const EVP_MD *md;
+ /* message digest for MGF1 */
+ const EVP_MD *mgf1md;
/* PSS/OAEP salt length */
int saltlen;
/* Temp buffer */
@@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx)
rctx->pub_exp = NULL;
rctx->pad_mode = RSA_PKCS1_PADDING;
rctx->md = NULL;
+ rctx->mgf1md = NULL;
rctx->tbuf = NULL;
rctx->saltlen = -2;
@@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx)
OPENSSL_free(rctx);
}
}
+#ifdef OPENSSL_FIPS
+/* FIPS checker. Return value indicates status of context parameters:
+ * 1 : redirect to FIPS.
+ * 0 : don't redirect to FIPS.
+ * -1 : illegal operation in FIPS mode.
+ */
+
+static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx)
+ {
+ RSA_PKEY_CTX *rctx = ctx->data;
+ RSA *rsa = ctx->pkey->pkey.rsa;
+ int rv = -1;
+ if (!FIPS_mode())
+ return 0;
+ if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)
+ rv = 0;
+ if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv)
+ return -1;
+ if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ return 1;
+ }
+#endif
static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
const unsigned char *tbs, size_t tbslen)
@@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
RSA_PKEY_CTX *rctx = ctx->data;
RSA *rsa = ctx->pkey->pkey.rsa;
+#ifdef OPENSSL_FIPS
+ ret = pkey_fips_check_ctx(ctx);
+ if (ret < 0)
+ {
+ RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+ return -1;
+ }
+#endif
+
if (rctx->md)
{
if (tbslen != (size_t)EVP_MD_size(rctx->md))
@@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
RSA_R_INVALID_DIGEST_LENGTH);
return -1;
}
- if (rctx->pad_mode == RSA_X931_PADDING)
+#ifdef OPENSSL_FIPS
+ if (ret > 0)
+ {
+ unsigned int slen;
+ ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md,
+ rctx->pad_mode,
+ rctx->saltlen,
+ rctx->mgf1md,
+ sig, &slen);
+ if (ret > 0)
+ *siglen = slen;
+ else
+ *siglen = 0;
+ return ret;
+ }
+#endif
+
+ if (EVP_MD_type(rctx->md) == NID_mdc2)
+ {
+ unsigned int sltmp;
+ if (rctx->pad_mode != RSA_PKCS1_PADDING)
+ return -1;
+ ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2,
+ tbs, tbslen, sig, &sltmp, rsa);
+
+ if (ret <= 0)
+ return ret;
+ ret = sltmp;
+ }
+ else if (rctx->pad_mode == RSA_X931_PADDING)
{
if (!setup_tbuf(rctx, ctx))
return -1;
@@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
{
if (!setup_tbuf(rctx, ctx))
return -1;
- if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs,
- rctx->md, rctx->saltlen))
+ if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa,
+ rctx->tbuf, tbs,
+ rctx->md, rctx->mgf1md,
+ rctx->saltlen))
return -1;
ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf,
sig, rsa, RSA_NO_PADDING);
@@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
RSA_PKEY_CTX *rctx = ctx->data;
RSA *rsa = ctx->pkey->pkey.rsa;
size_t rslen;
+#ifdef OPENSSL_FIPS
+ int rv;
+ rv = pkey_fips_check_ctx(ctx);
+ if (rv < 0)
+ {
+ RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+ return -1;
+ }
+#endif
if (rctx->md)
{
+#ifdef OPENSSL_FIPS
+ if (rv > 0)
+ {
+ return FIPS_rsa_verify_digest(rsa,
+ tbs, tbslen,
+ rctx->md,
+ rctx->pad_mode,
+ rctx->saltlen,
+ rctx->mgf1md,
+ sig, siglen);
+
+ }
+#endif
if (rctx->pad_mode == RSA_PKCS1_PADDING)
return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen,
sig, siglen, rsa);
@@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
rsa, RSA_NO_PADDING);
if (ret <= 0)
return 0;
- ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md,
+ ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs,
+ rctx->md, rctx->mgf1md,
rctx->tbuf, rctx->saltlen);
if (ret <= 0)
return 0;
@@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE);
return -2;
+ case EVP_PKEY_CTRL_GET_RSA_PADDING:
+ *(int *)p2 = rctx->pad_mode;
+ return 1;
+
case EVP_PKEY_CTRL_RSA_PSS_SALTLEN:
- if (p1 < -2)
- return -2;
+ case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN:
if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
{
RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN);
return -2;
}
- rctx->saltlen = p1;
+ if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN)
+ *(int *)p2 = rctx->saltlen;
+ else
+ {
+ if (p1 < -2)
+ return -2;
+ rctx->saltlen = p1;
+ }
return 1;
case EVP_PKEY_CTRL_RSA_KEYGEN_BITS:
@@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
rctx->md = p2;
return 1;
+ case EVP_PKEY_CTRL_RSA_MGF1_MD:
+ case EVP_PKEY_CTRL_GET_RSA_MGF1_MD:
+ if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
+ {
+ RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD);
+ return -2;
+ }
+ if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD)
+ {
+ if (rctx->mgf1md)
+ *(const EVP_MD **)p2 = rctx->mgf1md;
+ else
+ *(const EVP_MD **)p2 = rctx->md;
+ }
+ else
+ rctx->mgf1md = p2;
+ return 1;
+
case EVP_PKEY_CTRL_DIGESTINIT:
case EVP_PKEY_CTRL_PKCS7_ENCRYPT:
case EVP_PKEY_CTRL_PKCS7_DECRYPT:
case EVP_PKEY_CTRL_PKCS7_SIGN:
+ return 1;
#ifndef OPENSSL_NO_CMS
- case EVP_PKEY_CTRL_CMS_ENCRYPT:
case EVP_PKEY_CTRL_CMS_DECRYPT:
+ {
+ X509_ALGOR *alg = NULL;
+ ASN1_OBJECT *encalg = NULL;
+ if (p2)
+ CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg);
+ if (alg)
+ X509_ALGOR_get0(&encalg, NULL, NULL, alg);
+ if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep)
+ rctx->pad_mode = RSA_PKCS1_OAEP_PADDING;
+ }
+ case EVP_PKEY_CTRL_CMS_ENCRYPT:
case EVP_PKEY_CTRL_CMS_SIGN:
-#endif
return 1;
+#endif
case EVP_PKEY_CTRL_PEER_KEY:
RSAerr(RSA_F_PKEY_RSA_CTRL,
RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE);
@@ -475,6 +611,8 @@ static int pkey_rsa_ctrl_str(EVP_PKEY_CTX *ctx,
pm = RSA_NO_PADDING;
else if (!strcmp(value, "oeap"))
pm = RSA_PKCS1_OAEP_PADDING;
+ else if (!strcmp(value, "oaep"))
+ pm = RSA_PKCS1_OAEP_PADDING;
else if (!strcmp(value, "x931"))
pm = RSA_X931_PADDING;
else if (!strcmp(value, "pss"))
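Together with EVP_PKEY_CTRL_RSA_MGF1_MD and the new GET variants, applications can now drive RSA-PSS with a distinct MGF1 digest through the generic EVP layer. A minimal sketch (hypothetical helper name, error checks trimmed; note the padding mode must already be PSS before the MGF1 ctrl is accepted):

    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    /* PSS-sign a precomputed SHA-256 digest with an explicit SHA-1 MGF1. */
    int sign_pss_sha256(EVP_PKEY *pkey, const unsigned char md[32],
        unsigned char *sig, size_t *siglen)
        {
        EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
        int rv = -1;

        if (ctx == NULL || EVP_PKEY_sign_init(ctx) <= 0)
            goto end;
        EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_PSS_PADDING);
        EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha256());
        EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, EVP_sha1());  /* new ctrl above   */
        EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, -1);      /* salt = digest len */
        rv = EVP_PKEY_sign(ctx, sig, siglen, md, 32);
    end:
        EVP_PKEY_CTX_free(ctx);
        return rv;
        }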
diff --git a/main/openssl/crypto/rsa/rsa_pss.c b/main/openssl/crypto/rsa/rsa_pss.c
index ac211e2f..5f9f533d 100644
--- a/main/openssl/crypto/rsa/rsa_pss.c
+++ b/main/openssl/crypto/rsa/rsa_pss.c
@@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0};
int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
const EVP_MD *Hash, const unsigned char *EM, int sLen)
{
+ return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen);
+ }
+
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+ const unsigned char *EM, int sLen)
+ {
int i;
int ret = 0;
int hLen, maskedDBLen, MSBits, emLen;
@@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
unsigned char *DB = NULL;
EVP_MD_CTX ctx;
unsigned char H_[EVP_MAX_MD_SIZE];
+ EVP_MD_CTX_init(&ctx);
+
+ if (mgf1Hash == NULL)
+ mgf1Hash = Hash;
hLen = EVP_MD_size(Hash);
if (hLen < 0)
@@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
else if (sLen == -2) sLen = -2;
else if (sLen < -2)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
@@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
emLen = RSA_size(rsa);
if (EM[0] & (0xFF << MSBits))
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID);
goto err;
}
if (MSBits == 0)
@@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
}
if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE);
goto err;
}
if (EM[emLen - 1] != 0xbc)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID);
goto err;
}
maskedDBLen = emLen - hLen - 1;
@@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
DB = OPENSSL_malloc(maskedDBLen);
if (!DB)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE);
goto err;
}
- if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0)
+ if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0)
goto err;
for (i = 0; i < maskedDBLen; i++)
DB[i] ^= EM[i];
@@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ;
if (DB[i++] != 0x1)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED);
goto err;
}
if (sLen >= 0 && (maskedDBLen - i) != sLen)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
- EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, Hash, NULL);
- EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
- EVP_DigestUpdate(&ctx, mHash, hLen);
+ if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+ || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+ || !EVP_DigestUpdate(&ctx, mHash, hLen))
+ goto err;
if (maskedDBLen - i)
- EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i);
- EVP_DigestFinal(&ctx, H_, NULL);
- EVP_MD_CTX_cleanup(&ctx);
+ {
+ if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i))
+ goto err;
+ }
+ if (!EVP_DigestFinal_ex(&ctx, H_, NULL))
+ goto err;
if (memcmp(H_, H, hLen))
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE);
ret = 0;
}
else
@@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
err:
if (DB)
OPENSSL_free(DB);
+ EVP_MD_CTX_cleanup(&ctx);
return ret;
@@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
const unsigned char *mHash,
const EVP_MD *Hash, int sLen)
{
+ return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen);
+ }
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+ const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen)
+ {
int i;
int ret = 0;
int hLen, maskedDBLen, MSBits, emLen;
unsigned char *H, *salt = NULL, *p;
EVP_MD_CTX ctx;
+ if (mgf1Hash == NULL)
+ mgf1Hash = Hash;
+
hLen = EVP_MD_size(Hash);
if (hLen < 0)
goto err;
@@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
else if (sLen == -2) sLen = -2;
else if (sLen < -2)
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
@@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
}
else if (emLen < (hLen + sLen + 2))
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
- RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
goto err;
}
if (sLen > 0)
@@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
salt = OPENSSL_malloc(sLen);
if (!salt)
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
- ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE);
goto err;
}
if (RAND_bytes(salt, sLen) <= 0)
@@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
maskedDBLen = emLen - hLen - 1;
H = EM + maskedDBLen;
EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, Hash, NULL);
- EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
- EVP_DigestUpdate(&ctx, mHash, hLen);
- if (sLen)
- EVP_DigestUpdate(&ctx, salt, sLen);
- EVP_DigestFinal(&ctx, H, NULL);
+ if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+ || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+ || !EVP_DigestUpdate(&ctx, mHash, hLen))
+ goto err;
+ if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen))
+ goto err;
+ if (!EVP_DigestFinal_ex(&ctx, H, NULL))
+ goto err;
EVP_MD_CTX_cleanup(&ctx);
/* Generate dbMask in place then perform XOR on it */
- if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash))
+ if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash))
goto err;
p = EM;
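The *_mgf1 entry points keep the old functions as thin wrappers (mgf1Hash == NULL falls back to the message digest), so existing callers are unaffected. Used directly, they pair with raw RSA exactly as pkey_rsa_sign()/pkey_rsa_verify() do above. Condensed sketch, assuming a hypothetical 2048-bit key rsa and SHA-256 digest mhash, error checks trimmed:

    unsigned char em[256], sig[256];    /* RSA_size(rsa) == 256 assumed */

    /* sign side: PSS-pad, then raw private-key operation */
    RSA_padding_add_PKCS1_PSS_mgf1(rsa, em, mhash, EVP_sha256(), NULL, -1);
    RSA_private_encrypt(RSA_size(rsa), em, sig, rsa, RSA_NO_PADDING);

    /* verify side: raw public-key operation, then padding check */
    RSA_public_decrypt(RSA_size(rsa), sig, em, rsa, RSA_NO_PADDING);
    RSA_verify_PKCS1_PSS_mgf1(rsa, mhash, EVP_sha256(), NULL, em, -1);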
diff --git a/main/openssl/crypto/rsa/rsa_sign.c b/main/openssl/crypto/rsa/rsa_sign.c
index 0be4ec7f..b6f6037a 100644
--- a/main/openssl/crypto/rsa/rsa_sign.c
+++ b/main/openssl/crypto/rsa/rsa_sign.c
@@ -77,6 +77,14 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
const unsigned char *s = NULL;
X509_ALGOR algor;
ASN1_OCTET_STRING digest;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_SIGN, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
if((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign)
{
return rsa->meth->rsa_sign(type, m, m_len,
@@ -153,6 +161,15 @@ int int_rsa_verify(int dtype, const unsigned char *m,
unsigned char *s;
X509_SIG *sig=NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
+
if (siglen != (unsigned int)RSA_size(rsa))
{
RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_WRONG_SIGNATURE_LENGTH);
@@ -182,6 +199,22 @@ int int_rsa_verify(int dtype, const unsigned char *m,
i=RSA_public_decrypt((int)siglen,sigbuf,s,rsa,RSA_PKCS1_PADDING);
if (i <= 0) goto err;
+ /* Oddball MDC2 case: signature can be OCTET STRING.
+	 * Check for correct tag and length octets.
+ */
+ if (dtype == NID_mdc2 && i == 18 && s[0] == 0x04 && s[1] == 0x10)
+ {
+ if (rm)
+ {
+ memcpy(rm, s + 2, 16);
+ *prm_len = 16;
+ ret = 1;
+ }
+ else if(memcmp(m, s + 2, 16))
+ RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_BAD_SIGNATURE);
+ else
+ ret = 1;
+ }
/* Special case: SSL signature */
if(dtype == NID_md5_sha1) {
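The MDC2 special case added above accepts a bare DER OCTET STRING in place of the usual DigestInfo structure, which is why the recovered plaintext is matched against exactly 18 bytes:

    /* 18 recovered bytes, checked as s[0] == 0x04 && s[1] == 0x10:
     *
     *   0x04   0x10    <16-byte MDC-2 digest>
     *   tag    length  value
     */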
diff --git a/main/openssl/crypto/s390xcap.c b/main/openssl/crypto/s390xcap.c
index ffbe0235..f2e94ef4 100644
--- a/main/openssl/crypto/s390xcap.c
+++ b/main/openssl/crypto/s390xcap.c
@@ -4,7 +4,7 @@
#include <setjmp.h>
#include <signal.h>
-extern unsigned long OPENSSL_s390xcap_P;
+extern unsigned long OPENSSL_s390xcap_P[];
static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
@@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void)
sigset_t oset;
struct sigaction ill_act,oact;
- if (OPENSSL_s390xcap_P) return;
+ if (OPENSSL_s390xcap_P[0]) return;
+
+ OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1);
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
@@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void)
sigaction (SIGILL,&ill_act,&oact);
/* protection against missing store-facility-list-extended */
- if (sigsetjmp(ill_jmp,0) == 0)
- OPENSSL_s390xcap_P = OPENSSL_s390x_facilities();
- else
- OPENSSL_s390xcap_P = 1UL<<63;
+ if (sigsetjmp(ill_jmp,1) == 0)
+ OPENSSL_s390x_facilities();
sigaction (SIGILL,&oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
diff --git a/main/openssl/crypto/s390xcpuid.S b/main/openssl/crypto/s390xcpuid.S
index b053c6a2..06815347 100644
--- a/main/openssl/crypto/s390xcpuid.S
+++ b/main/openssl/crypto/s390xcpuid.S
@@ -5,10 +5,14 @@
.align 16
OPENSSL_s390x_facilities:
lghi %r0,0
- .long 0xb2b0f010 # stfle 16(%r15)
- lg %r2,16(%r15)
- larl %r1,OPENSSL_s390xcap_P
- stg %r2,0(%r1)
+ larl %r2,OPENSSL_s390xcap_P
+ stg %r0,8(%r2)
+ .long 0xb2b02000 # stfle 0(%r2)
+ brc 8,.Ldone
+ lghi %r0,1
+ .long 0xb2b02000 # stfle 0(%r2)
+.Ldone:
+ lg %r2,0(%r2)
br %r14
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
@@ -58,6 +62,9 @@ OPENSSL_wipe_cpu:
.type OPENSSL_cleanse,@function
.align 16
OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+ llgfr %r3,%r3
+#endif
lghi %r4,15
lghi %r0,0
clgr %r3,%r4
@@ -89,4 +96,4 @@ OPENSSL_cleanse:
.section .init
brasl %r14,OPENSSL_cpuid_setup
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
diff --git a/main/openssl/crypto/sha/asm/sha1-586.S b/main/openssl/crypto/sha/asm/sha1-586.S
new file mode 100644
index 00000000..e77f6541
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-586.S
@@ -0,0 +1,1380 @@
+.file "sha1-586.s"
+.text
+.globl sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+.L_sha1_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%eax
+ subl $76,%esp
+ shll $6,%eax
+ addl %esi,%eax
+ movl %eax,104(%esp)
+ movl 16(%ebp),%edi
+ jmp .L000loop
+.align 16
+.L000loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,12(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edx,28(%esp)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edx,44(%esp)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,48(%esp)
+ movl %ebx,52(%esp)
+ movl %ecx,56(%esp)
+ movl %edx,60(%esp)
+ movl %esi,100(%esp)
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+
+ movl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl (%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 4(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 8(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 12(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 16(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 20(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 24(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 28(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 32(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 36(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 40(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 44(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 48(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 52(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 56(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 60(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 8(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 32(%esp),%ebx
+ andl %edx,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ xorl %esi,%ebp
+ addl %ebp,%eax
+ movl %ecx,%ebp
+ rorl $2,%edx
+ movl %ebx,(%esp)
+ roll $5,%ebp
+ leal 1518500249(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edi,%ebp
+ xorl 36(%esp),%eax
+ andl %ecx,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ xorl %edi,%ebp
+ addl %ebp,%esi
+ movl %ebx,%ebp
+ rorl $2,%ecx
+ movl %eax,4(%esp)
+ roll $5,%ebp
+ leal 1518500249(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 40(%esp),%esi
+ andl %ebx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ xorl %edx,%ebp
+ addl %ebp,%edi
+ movl %eax,%ebp
+ rorl $2,%ebx
+ movl %esi,8(%esp)
+ roll $5,%ebp
+ leal 1518500249(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 44(%esp),%edi
+ andl %eax,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ xorl %ecx,%ebp
+ addl %ebp,%edx
+ movl %esi,%ebp
+ rorl $2,%eax
+ movl %edi,12(%esp)
+ roll $5,%ebp
+ leal 1518500249(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,52(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,56(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,60(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl (%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 8(%esp),%edx
+ xorl %eax,%ebp
+ xorl 32(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 4(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 56(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,4(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 8(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 16(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,8(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 12(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl (%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,12(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 16(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 24(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 4(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,16(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 20(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 28(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 8(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,20(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 24(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 32(%esp),%edx
+ xorl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 12(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,24(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 28(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 16(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,28(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 32(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl (%esp),%ebx
+ andl %edx,%ebp
+ xorl 20(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,32(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 36(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl 4(%esp),%eax
+ andl %ecx,%ebp
+ xorl 24(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,36(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 40(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 8(%esp),%esi
+ andl %ebx,%ebp
+ xorl 28(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,40(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 44(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 12(%esp),%edi
+ andl %eax,%ebp
+ xorl 32(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,44(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 48(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 16(%esp),%edx
+ andl %esi,%ebp
+ xorl 36(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,48(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 52(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 20(%esp),%ecx
+ andl %edi,%ebp
+ xorl 40(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,52(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 56(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl (%esp),%ebx
+ xorl %esi,%ebp
+ xorl 24(%esp),%ebx
+ andl %edx,%ebp
+ xorl 44(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,56(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 60(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 4(%esp),%eax
+ xorl %edi,%ebp
+ xorl 28(%esp),%eax
+ andl %ecx,%ebp
+ xorl 48(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,60(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl (%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 8(%esp),%esi
+ xorl %edx,%ebp
+ xorl 32(%esp),%esi
+ andl %ebx,%ebp
+ xorl 52(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 4(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 12(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 36(%esp),%edi
+ andl %eax,%ebp
+ xorl 56(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,4(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 8(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 16(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 40(%esp),%edx
+ andl %esi,%ebp
+ xorl 60(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,8(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 12(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 20(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 44(%esp),%ecx
+ andl %edi,%ebp
+ xorl (%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,12(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 16(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 24(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 48(%esp),%ebx
+ andl %edx,%ebp
+ xorl 4(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,16(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 20(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 28(%esp),%eax
+ xorl %edi,%ebp
+ xorl 52(%esp),%eax
+ andl %ecx,%ebp
+ xorl 8(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,20(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 24(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 32(%esp),%esi
+ xorl %edx,%ebp
+ xorl 56(%esp),%esi
+ andl %ebx,%ebp
+ xorl 12(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,24(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 28(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 36(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 60(%esp),%edi
+ andl %eax,%ebp
+ xorl 16(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,28(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 32(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 40(%esp),%edx
+ xorl %ebx,%ebp
+ xorl (%esp),%edx
+ andl %esi,%ebp
+ xorl 20(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,32(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 36(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 44(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 4(%esp),%ecx
+ andl %edi,%ebp
+ xorl 24(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,36(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 40(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 48(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 8(%esp),%ebx
+ andl %edx,%ebp
+ xorl 28(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,40(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 44(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 52(%esp),%eax
+ xorl %edi,%ebp
+ xorl 12(%esp),%eax
+ andl %ecx,%ebp
+ xorl 32(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,44(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 48(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 56(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 36(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,48(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 52(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 60(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,52(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 56(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl (%esp),%edx
+ xorl %eax,%ebp
+ xorl 24(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,56(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 60(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 4(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 48(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,60(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 8(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 32(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edi,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,4(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 16(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%esi
+ xorl %edx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,8(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 20(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edi
+ xorl %ecx,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,12(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ leal 3395469782(%edi,%edx,1),%edi
+ addl %ebp,%edi
+ movl 96(%esp),%ebp
+ movl 100(%esp),%edx
+ addl (%ebp),%edi
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%eax
+ addl 12(%ebp),%ebx
+ addl 16(%ebp),%ecx
+ movl %edi,(%ebp)
+ addl $64,%edx
+ movl %esi,4(%ebp)
+ cmpl 104(%esp),%edx
+ movl %eax,8(%ebp)
+ movl %ecx,%edi
+ movl %ebx,12(%ebp)
+ movl %edx,%esi
+ movl %ecx,16(%ebp)
+ jb .L000loop
+ addl $76,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/sha/asm/sha1-586.pl b/main/openssl/crypto/sha/asm/sha1-586.pl
index a1f87628..2b119ffa 100644
--- a/main/openssl/crypto/sha/asm/sha1-586.pl
+++ b/main/openssl/crypto/sha/asm/sha1-586.pl
@@ -12,6 +12,8 @@
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.
+# January, September 2004.
+#
# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
@@ -31,12 +33,92 @@
# ----------------------------------------------------------------
# <appro@fy.chalmers.se>
+# August 2009.
+#
+# George Spelvin has pointed out that F_40_59(b,c,d) can be rewritten
+# as '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
+# results and lighten the "pressure" on scratch registers. This
+# resulted in a >12% performance improvement on contemporary AMD cores
+# (with no degradation on other CPUs:-). The code was also revised to
+# maximize the "distance" between instructions producing input to the
+# 'lea' instruction and the 'lea' instruction itself, which is
+# essential for the Intel Atom core and resulted in a ~15% improvement.
+
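+# (Editor's aside - a minimal sanity check of that identity in plain
+# Perl, illustrative only and not part of the original patch. The two
+# addends can never carry into the same bit position, because c&d is
+# set only where c and d agree while b&(c^d) is set only where they
+# differ, so the addition degenerates to a bitwise OR, i.e. the
+# majority function F_40_59:
+#
+#	for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
+#	    my $maj = ($b & $c) | ($b & $d) | ($c & $d);	# F_40_59
+#	    die "mismatch" if $maj != ($c & $d) + ($b & ($c ^ $d));
+#	}}}
+#
+# Exhausting all single-bit inputs suffices; the identity then holds
+# bitwise for 32-bit words.)
+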
+# October 2010.
+#
+# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind
+# it is to offload the message schedule, denoted by Wt in the NIST
+# specification (Xupdate in the OpenSSL source), to the SIMD unit. The
+# idea is not novel, and in an SSE2 context it was first explored by
+# Dean Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html.
+# Since then several things have changed that made it interesting
+# again:
+#
+# a) XMM units became faster and wider;
+# b) instruction set became more versatile;
+# c) an important observation was made by Max Locktyukhin, which made
+# it possible to reduce the number of instructions required to
+# perform the operation in question; for further details see
+# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
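+# (Editor's aside: the schedule being offloaded is just the standard
+# SHA-1 recurrence W[t] = ROL1(W[t-3]^W[t-8]^W[t-14]^W[t-16]) - shown
+# here as a hedged scalar Perl sketch, not code from this patch, with
+# @W holding W[t-16..t-1] in $W[0..15]:
+#
+#	sub Xupdate {
+#	    my @W = @_;				# sixteen 32-bit words
+#	    my $x = $W[13] ^ $W[8] ^ $W[2] ^ $W[0];
+#	    return (($x << 1) | ($x >> 31)) & 0xffffffff;   # ROL(x,1)
+#	}
+#
+# The SIMD paths below evaluate four consecutive W[t] per step.)
+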
+# April 2011.
+#
+# Add an AVX code path, probably the most controversial one... The
+# thing is that the switch to AVX alone improves performance by as
+# little as 4% in comparison to the SSSE3 code path. But the result
+# below doesn't look like a 4% improvement... The trouble is that
+# Sandy Bridge decodes 'ro[rl]' as a pair of µ-ops, and it's the
+# additional µ-ops, two per round, that make it run slower than Core2
+# and Westmere. 'sh[rl]d', however, is decoded as a single µ-op by
+# Sandy Bridge, and it's replacing 'ro[rl]' with the equivalent
+# 'sh[rl]d' that is responsible for the impressive 5.1 cycles per
+# processed byte. But 'sh[rl]d' has not traditionally been fast, nor
+# does it appear to be fast in the upcoming Bulldozer [according to its
+# optimization manual]. Which is why the AVX code path is guarded by
+# *both* the AVX bit and a synthetic bit denoting Intel CPUs. One can
+# argue that this is unfair to AMD, but without 'sh[rl]d' it makes no
+# sense to keep the AVX code path. If somebody feels strongly about it,
+# it's probably more appropriate to discuss the possibility of using
+# the vector rotate XOP on AMD...
+
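+# (Editor's aside, illustrating the substitution in question: for a
+# 32-bit register the two instructions below are architecturally
+# equivalent, which is what lets the AVX path trade one for the other -
+#
+#	roll	$5,%eax			# eax = ROL(eax,5)
+#	shldl	$5,%eax,%eax		# same result, single µ-op on SNB
+#
+# the per-µ-op decoding claims are restated from the paragraph above,
+# not independently verified.)
+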
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+# x86 SSSE3 AVX
+# Pentium 15.7 -
+# PIII 11.5 -
+# P4 10.6 -
+# AMD K8 7.1 -
+# Core2 7.3 6.1/+20% -
+# Atom 12.5 9.5(*)/+32% -
+# Westmere 7.3 5.6/+30% -
+# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
+#
+# (*)	Loop is 1056 instructions long and the expected result is ~8.25.
+#	It remains a mystery [to me] why ILP is limited to 1.7.
+#
+# (**)	As per the comment above, the result is for AVX *plus* sh[rl]d.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+$ymm=1 if ($xmm &&
+ `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19); # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.03); # first version supporting AVX
+
+&external_label("OPENSSL_ia32cap_P") if ($xmm);
+
+
$A="eax";
$B="ebx";
$C="ecx";
@@ -47,6 +129,10 @@ $tmp1="ebp";
@V=($A,$B,$C,$D,$E,$T);
+$alt=0; # 1 denotes alternative IALU implementation, which performs
+ # 8% *worse* on P4, same on Westmere and Atom, 2% better on
+ # Sandy Bridge...
+
sub BODY_00_15
{
local($n,$a,$b,$c,$d,$e,$f)=@_;
@@ -59,16 +145,18 @@ sub BODY_00_15
&rotl($tmp1,5); # tmp1=ROTATE(a,5)
&xor($f,$d);
&add($tmp1,$e); # tmp1+=e;
- &and($f,$b);
- &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
+ &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
# with xi, also note that e becomes
# f in next round...
- &xor($f,$d); # f holds F_00_19(b,c,d)
+ &and($f,$b);
&rotr($b,2); # b=ROTATE(b,30)
- &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
+ &xor($f,$d); # f holds F_00_19(b,c,d)
+ &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
- if ($n==15) { &add($f,$tmp1); } # f+=tmp1
+ if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
+ &add($f,$tmp1); } # f+=tmp1
else { &add($tmp1,$f); } # f becomes a in next round
+ &mov($tmp1,$a) if ($alt && $n==15);
}
sub BODY_16_19
@@ -77,22 +165,41 @@ sub BODY_16_19
&comment("16_19 $n");
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
- &xor($f,&swtmp(($n+2)%16));
- &xor($tmp1,$d);
- &xor($f,&swtmp(($n+8)%16));
- &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
- &rotr($b,2); # b=ROTATE(b,30)
+if ($alt) {
+ &xor($c,$d);
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
+ &xor($f,&swtmp(($n+8)%16));
+ &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
+ &xor($c,$d); # restore $c
+ &mov($tmp1,$a); # b in next round
+ &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+} else {
+ &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &and($tmp1,$b);
&xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
&xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
- &mov(&swtmp($n%16),$f); # xi=f
- &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=F_00_19(b,c,d)
- &add($f,$e); # f+=ROTATE(a,5)
+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
+ &mov($tmp1,$a);
+ &rotr($b,2); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($tmp1,5); # ROTATE(a,5)
+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$tmp1); # f+=ROTATE(a,5)
+}
}
sub BODY_20_39
@@ -102,21 +209,41 @@ sub BODY_20_39
&comment("20_39 $n");
+if ($alt) {
+ &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
+ &xor($f,&swtmp(($n+8)%16));
+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &mov($tmp1,$a); # b in next round
+ &rotr($b,7); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &xor($b,$c) if($n==39);# warm up for BODY_40_59
+ &and($tmp1,$b) if($n==39);
+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+ &rotr($a,5) if ($n==79);
+} else {
&mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &rotr($b,2); # b=ROTATE(b,30)
- &xor($f,&swtmp(($n+2)%16));
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
&xor($tmp1,$c);
&xor($f,&swtmp(($n+8)%16));
&xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
&xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
- &add($tmp1,$e);
- &mov(&swtmp($n%16),$f); # xi=f
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e
- &add($f,$e); # f+=ROTATE(a,5)
+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
+ &rotr($b,2); # b=ROTATE(b,30)
+ &mov($tmp1,$a);
+ &rotl($tmp1,5); # ROTATE(a,5)
+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+ &add($f,$tmp1); # f+=ROTATE(a,5)
+}
}
sub BODY_40_59
@@ -125,41 +252,86 @@ sub BODY_40_59
&comment("40_59 $n");
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,&swtmp(($n+2)%16));
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp(($n+8)%16));
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp(($n+13)%16));
- &xor($f,$tmp1); # f holds xa^xb^xc^xd
- &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
+if ($alt) {
+ &add($e,$tmp1); # e+=b&(c^d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &mov($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &xor($c,$d); # restore $c
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
- &or($tmp1,$c);
- &mov(&swtmp($n%16),$f); # xi=f
- &and($tmp1,$d);
- &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
- &mov($e,$b); # e becomes volatile and is used
- # to calculate F_40_59(b,c,d)
+ &and($tmp1,$c);
+ &rotr($b,7); # b=ROTATE(b,30)
+ &add($e,$tmp1); # e+=c&d
+ &mov($tmp1,$a); # b in next round
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &xor($b,$c) if ($n<59);
+ &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
+ &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+} else {
+ &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &and($tmp1,$b);
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &add($tmp1,$e); # b&(c^d)+=e
&rotr($b,2); # b=ROTATE(b,30)
- &and($e,$c);
- &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
- &mov($e,$a);
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=tmp1;
+ &mov($e,$a); # e becomes volatile
+ &rotl($e,5); # ROTATE(a,5)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
+ &mov($tmp1,$c);
&add($f,$e); # f+=ROTATE(a,5)
+ &and($tmp1,$d);
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$tmp1); # f+=c&d
+}
}
&function_begin("sha1_block_data_order");
+if ($xmm) {
+ &static_label("ssse3_shortcut");
+ &static_label("avx_shortcut") if ($ymm);
+ &static_label("K_XX_XX");
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
+ &mov ($A,&DWP(0,$T));
+ &mov ($D,&DWP(4,$T));
+ &test ($D,1<<9); # check SSSE3 bit
+ &jz (&label("x86"));
+ &test ($A,1<<24); # check FXSR bit
+ &jz (&label("x86"));
+ if ($ymm) {
+ &and ($D,1<<28); # mask AVX bit
+ &and ($A,1<<30); # mask "Intel CPU" bit
+ &or ($A,$D);
+ &cmp ($A,1<<28|1<<30);
+ &je (&label("avx_shortcut"));
+ }
+ &jmp (&label("ssse3_shortcut"));
+ &set_label("x86",16);
+}
&mov($tmp1,&wparam(0)); # SHA_CTX *c
&mov($T,&wparam(1)); # const void *input
&mov($A,&wparam(2)); # size_t num
- &stack_push(16); # allocate X[16]
+ &stack_push(16+3); # allocate X[16]
&shl($A,6);
&add($A,$T);
&mov(&wparam(2),$A); # pointer beyond the end of input
&mov($E,&DWP(16,$tmp1));# pre-load E
+ &jmp(&label("loop"));
- &set_label("loop",16);
+&set_label("loop",16);
# copy input chunk to X, but reversing byte order!
for ($i=0; $i<16; $i+=4)
@@ -213,8 +385,845 @@ sub BODY_40_59
&mov(&DWP(16,$tmp1),$C);
&jb(&label("loop"));
- &stack_pop(16);
+ &stack_pop(16+3);
&function_end("sha1_block_data_order");
+
+if ($xmm) {
+######################################################################
+# The SSSE3 implementation.
+#
+# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
+# last 32 elements of the message schedule or Xupdate outputs. The
+# first 4 quadruples are simply byte-swapped input; the next 4 are
+# calculated according to the method originally suggested by Dean
+# Gaudet (modulo being implemented in SSSE3). Once 8 quadruples or 32
+# elements are collected, it switches to the routine proposed by Max
+# Locktyukhin.
+#
+# Calculations inevitably require temporary registers, and there are
+# no %xmm registers left to spare. For this reason part of the ring
+# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
+# buffer on the stack. Keep in mind that X[2] aliases X[-6], X[3]
+# aliases X[-5], and X[4] aliases X[-4]...
+#
+# Another notable optimization is aggressive stack frame compression,
+# which aims to minimize the number of 9-byte instructions...
+#
+# Yet another notable optimization is the "jumping" $B variable: no
+# register is permanently allocated for the $B value. This made it
+# possible to eliminate one instruction from body_20_39...
+#
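+# (Editor's aside on the aliasing above: @X is indexed modulo 8 via a
+# bitwise AND, so negative "logical" indices wrap onto the same slots.
+# A one-line illustrative check, not part of the patch:
+#
+#	print join(",", map { $_ & 7 } (-6, -5, -4));	# prints 2,3,4
+#
+# hence X[-6], X[-5] and X[-4] land on ring slots 2, 3 and 4, the ones
+# offloaded to the stack.)
+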
+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0; # hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+&function_begin("_sha1_block_data_order_ssse3");
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("ssse3_shortcut");
+
+ &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
+ &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
+ &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
+ &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
+ &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
+
+ &mov ($E,&wparam(0)); # load argument block
+ &mov ($inp=@T[1],&wparam(1));
+ &mov ($D,&wparam(2));
+ &mov (@T[0],"esp");
+
+ # stack frame layout
+ #
+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
+ # X[4]+K X[5]+K X[6]+K X[7]+K
+ # X[8]+K X[9]+K X[10]+K X[11]+K
+ # X[12]+K X[13]+K X[14]+K X[15]+K
+ #
+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
+ # X[4] X[5] X[6] X[7]
+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
+ #
+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
+ # K_40_59 K_40_59 K_40_59 K_40_59
+ # K_60_79 K_60_79 K_60_79 K_60_79
+ # K_00_19 K_00_19 K_00_19 K_00_19
+ # pbswap mask
+ #
+ # +192 ctx # argument block
+ # +196 inp
+ # +200 end
+ # +204 esp
+ &sub ("esp",208);
+ &and ("esp",-64);
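+	# (editor's note, inferred from the layout map above rather than
+	# stated by the patch: 208 = the 192 bytes of xfer/backtrace/
+	# constant area plus the 16-byte ctx/inp/end/esp save block at
+	# +192..+204, and and(-64) rounds %esp down to a 64-byte boundary
+	# so the aligned movdqa accesses below are safe.)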
+
+ &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
+ &movdqa (&QWP(112+16,"esp"),@X[5]);
+ &movdqa (&QWP(112+32,"esp"),@X[6]);
+ &shl ($D,6); # len*64
+ &movdqa (&QWP(112+48,"esp"),@X[3]);
+ &add ($D,$inp); # end of input
+ &movdqa (&QWP(112+64,"esp"),@X[2]);
+ &add ($inp,64);
+ &mov (&DWP(192+0,"esp"),$E); # save argument block
+ &mov (&DWP(192+4,"esp"),$inp);
+ &mov (&DWP(192+8,"esp"),$D);
+ &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
+
+ &mov ($A,&DWP(0,$E)); # load context
+ &mov ($B,&DWP(4,$E));
+ &mov ($C,&DWP(8,$E));
+ &mov ($D,&DWP(12,$E));
+ &mov ($E,&DWP(16,$E));
+ &mov (@T[0],$B); # magic seed
+
+ &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
+ &movdqu (@X[-3&7],&QWP(-48,$inp));
+ &movdqu (@X[-2&7],&QWP(-32,$inp));
+ &movdqu (@X[-1&7],&QWP(-16,$inp));
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &pshufb (@X[-3&7],@X[2]);
+ &pshufb (@X[-2&7],@X[2]);
+ &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+ &pshufb (@X[-1&7],@X[2]);
+ &paddd (@X[-4&7],@X[3]); # add K_00_19
+ &paddd (@X[-3&7],@X[3]);
+ &paddd (@X[-2&7],@X[3]);
+ &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
+ &psubd (@X[-4&7],@X[3]); # restore X[]
+ &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
+ &psubd (@X[-3&7],@X[3]);
+ &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
+ &psubd (@X[-2&7],@X[3]);
+ &movdqa (@X[0],@X[-3&7]);
+ &jmp (&label("loop"));
+
+######################################################################
+# The SSE instruction sequence is first broken into groups of
+# independent instructions, independent with respect to their inputs
+# and the shifter (not all architectures have more than one). Then
+# IALU instructions are "knitted in" between the SSE groups. Distance
+# is maintained for an SSE latency of 2, in the hope that it suits the
+# upcoming AMD Bulldozer [which allegedly also implements SSSE3]
+# better...
+#
+# Temporary register usage. X[2] is volatile at the entry and at the
+# end is restored from the backtrace ring buffer. X[3] is expected to
+# contain the current K_XX_XX constant and is used to calculate X[-1]+K
+# from the previous round; it becomes volatile the moment the value is
+# saved to the stack for transfer to the IALU. X[4] becomes volatile
+# whenever X[-4] is accumulated and offloaded to the backtrace ring
+# buffer; at the end it is loaded with the next K_XX_XX [which becomes
+# X[3] in the next round]...
+#
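+# (Editor's aside on the knitting mechanism, inferred from the code
+# below: each body_xx_yy call returns a list of instruction *strings*,
+# four rounds' worth are gathered into @insns, and every
+# eval(shift(@insns)) emits exactly one IALU instruction between SIMD
+# ones, e.g.
+#
+#	my @insns = (&$body,&$body,&$body,&$body);  # collect, don't emit
+#	&pxor	(@X[0],@X[2]);			    # SIMD instruction
+#	eval(shift(@insns));			    # one knitted IALU op
+#
+# which is how the "distance" described above is maintained.)
+#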
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &movdqa (@X[2],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@X[3],@X[-1&7]);
+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@X[2],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@X[4],@X[0]);
+ &movdqa (@X[2],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@X[2],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[3],@X[4]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@X[4],30);
+ &por (@X[0],@X[2]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@X[3],2);
+ &pxor (@X[0],@X[4]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
+ &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+ }
+ &paddd (@X[3],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@X[2],@X[0]);
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@X[2],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@X[2]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@X[3],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@X[3],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
+ &cmp ($inp,&DWP(192+8,"esp"));
+ &je (&label("done"));
+
+ &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
+ &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
+ &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
+ &movdqu (@X[-3&7],&QWP(16,$inp));
+ &movdqu (@X[-2&7],&QWP(32,$inp));
+ &movdqu (@X[-1&7],&QWP(48,$inp));
+ &add ($inp,64);
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &mov (&DWP(192+4,"esp"),$inp);
+ &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@X[3]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
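+# (Editor's aside on two idioms above, inferred from the comments
+# rather than stated outright: the round function uses the standard
+# identity Ch(b,c,d) = (b&c)|(~b&d) = ((c^d)&b)^d, which needs only one
+# temporary and explains the xor/and/xor dance on $c and @T[0]; and the
+# $j?7:2 rotate apparently folds two steps into one - the register that
+# plays $b in the next round still carries the <<<5 applied when it was
+# $a, so >>>7 = >>>5 + >>>2 both undoes it and applies the usual
+# b>>>2.)
+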
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+&set_label("loop",16);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov ($B,@T[0]);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+ &movdqa (@X[0],@X[-3&7]);
+
+ &jmp (&label("loop"));
+
+&set_label("done",16); $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+
+&function_end("_sha1_block_data_order_ssse3");
+
+if ($ymm) {
+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0; # hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+&function_begin("_sha1_block_data_order_avx");
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("avx_shortcut");
+ &vzeroall();
+
+ &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
+ &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
+ &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
+ &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
+ &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
+
+ &mov ($E,&wparam(0)); # load argument block
+ &mov ($inp=@T[1],&wparam(1));
+ &mov ($D,&wparam(2));
+ &mov (@T[0],"esp");
+
+ # stack frame layout
+ #
+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
+ # X[4]+K X[5]+K X[6]+K X[7]+K
+ # X[8]+K X[9]+K X[10]+K X[11]+K
+ # X[12]+K X[13]+K X[14]+K X[15]+K
+ #
+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
+ # X[4] X[5] X[6] X[7]
+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
+ #
+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
+ # K_40_59 K_40_59 K_40_59 K_40_59
+ # K_60_79 K_60_79 K_60_79 K_60_79
+ # K_00_19 K_00_19 K_00_19 K_00_19
+ # pbswap mask
+ #
+ # +192 ctx # argument block
+ # +196 inp
+ # +200 end
+ # +204 esp
+ &sub ("esp",208);
+ &and ("esp",-64);
+
+ &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
+ &vmovdqa(&QWP(112+16,"esp"),@X[5]);
+ &vmovdqa(&QWP(112+32,"esp"),@X[6]);
+ &shl ($D,6); # len*64
+ &vmovdqa(&QWP(112+48,"esp"),@X[3]);
+ &add ($D,$inp); # end of input
+ &vmovdqa(&QWP(112+64,"esp"),@X[2]);
+ &add ($inp,64);
+ &mov (&DWP(192+0,"esp"),$E); # save argument block
+ &mov (&DWP(192+4,"esp"),$inp);
+ &mov (&DWP(192+8,"esp"),$D);
+ &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
+
+ &mov ($A,&DWP(0,$E)); # load context
+ &mov ($B,&DWP(4,$E));
+ &mov ($C,&DWP(8,$E));
+ &mov ($D,&DWP(12,$E));
+ &mov ($E,&DWP(16,$E));
+ &mov (@T[0],$B); # magic seed
+
+ &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
+ &vmovdqu(@X[-3&7],&QWP(-48,$inp));
+ &vmovdqu(@X[-2&7],&QWP(-32,$inp));
+ &vmovdqu(@X[-1&7],&QWP(-16,$inp));
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
+ &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
+ &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+ &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
+ &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
+ &vpaddd (@X[1],@X[-3&7],@X[3]);
+ &vpaddd (@X[2],@X[-2&7],@X[3]);
+ &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
+ &vmovdqa(&QWP(0+16,"esp"),@X[1]);
+ &vmovdqa(&QWP(0+32,"esp"),@X[2]);
+ &jmp (&label("loop"));
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@X[2],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@X[3],@X[4],30);
+ &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@X[4],@X[4],2);
+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+ }
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@X[2],@X[0],30);
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
+ &cmp ($inp,&DWP(192+8,"esp"));
+ &je (&label("done"));
+
+ &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
+ &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
+ &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
+ &vmovdqu(@X[-3&7],&QWP(16,$inp));
+ &vmovdqu(@X[-2&7],&QWP(32,$inp));
+ &vmovdqu(@X[-1&7],&QWP(48,$inp));
+ &add ($inp,64);
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &mov (&DWP(192+4,"esp"),$inp);
+ &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+&set_label("loop",16);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov ($B,@T[0]);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+
+ &jmp (&label("loop"));
+
+&set_label("done",16); $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+ &vzeroall();
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+&function_end("_sha1_block_data_order_avx");
+}
+&set_label("K_XX_XX",64);
+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
+}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
diff --git a/main/openssl/crypto/sha/asm/sha1-alpha.pl b/main/openssl/crypto/sha/asm/sha1-alpha.pl
new file mode 100644
index 00000000..6c4b9251
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-alpha.pl
@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by the vendor
+# compiler, 75% better than GCC [3.4], and in absolute terms 8.7 cycles
+# per processed byte. The implementation features a vectorized byte
+# swap, but not a vectorized Xupdate.
+
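+# (Editor's sketch of the scalar operation being vectorized, in Perl
+# and illustrative only: two big-endian words are fetched into one
+# 64-bit register and byte-swapped together; per 32-bit word the
+# effect is
+#
+#	sub bswap32 {
+#	    my $x = shift;
+#	    return (($x & 0xff) << 24) | (($x & 0xff00) << 8)
+#	         | (($x >> 8) & 0xff00) | (($x >> 24) & 0xff);
+#	}
+#
+# which the sll/srl/zapnot sequences below perform on both halves of a
+# quadword at once.)
+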
+@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
+ "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
+$ctx="a0"; # $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4"; # 20
+$C="a5";
+$D="t8";
+$E="t9"; @V=($A,$B,$C,$D,$E);
+$t0="t10"; # 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT"; # 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+ ldq_u @X[0],0+0($inp)
+ ldq_u @X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+ ldq_u @X[$i+2],($i+2)*4+0($inp)
+ ldq_u @X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+ extql @X[$i],$inp,@X[$i]
+ extqh @X[$i+1],$inp,@X[$i+1]
+
+ or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
+
+ srl @X[$i],24,$t0 # vectorized byte swap
+ srl @X[$i],8,$t2
+
+ sll @X[$i],8,$t3
+ sll @X[$i],24,@X[$i]
+ zapnot $t0,0x11,$t0
+ zapnot $t2,0x22,$t2
+
+ zapnot @X[$i],0x88,@X[$i]
+ or $t0,$t2,$t0
+ zapnot $t3,0x44,$t3
+ sll $a,5,$t1
+
+ or @X[$i],$t0,@X[$i]
+ addl $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ or @X[$i],$t3,@X[$i]
+ srl $a,27,$t0
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ extll @X[$i],4,@X[$i+1] # extract upper half
+ or $t2,$t3,$t2
+ addl @X[$i],$e,$e
+
+ addl $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addl $t0,$e,$e
+ addl $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+ sll $a,5,$t1
+ addl $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ srl $a,27,$t0
+ addl @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ or $t2,$t3,$t2
+ addl $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addl $t0,$e,$e
+ addl $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if ($i>=15); # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ and $b,$c,$t2
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ zapnot $a,0xf,$a
+ addl @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ addl $t1,$e,$e
+ or $t2,$t3,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addl $t0,$e,$e
+ srl @X[$j%16],31,$t1
+
+ addl $t2,$e,$e
+ srl $b,32,$t3
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ zapnot @X[$i%16],0xf,@X[$i%16]
+ or $t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79); # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$t3
+ addl $t1,$e,$e
+ xor $b,$c,$t2
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $b,2,$b
+ addl @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addl $t2,$e,$e
+ srl $a,27,$t0
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addl $t0,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79); # with context fetch
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ ldl @X[0],0($ctx)
+
+ sll $b,30,$t3
+ addl $t1,$e,$e
+ xor $b,$c,$t2
+ ldl @X[1],4($ctx)
+
+ srl $b,2,$b
+ addl @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ ldl @X[2],8($ctx)
+
+ srl $a,27,$t0
+ addl $t2,$e,$e
+ ldl @X[3],12($ctx)
+
+ or $t3,$b,$b
+ addl $t0,$e,$e
+ ldl @X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___; # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ and $b,$c,$t2
+ and $b,$d,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addl $t1,$e,$e
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addl $t0,$e,$e
+ or $t2,$t3,$t2
+ and $c,$d,$t3
+
+ or $t2,$t3,$t2
+ srl $b,32,$t3
+ addl @X[$i%16],$e,$e
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addl $t2,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl sha1_block_data_order
+.align 5
+.ent sha1_block_data_order
+sha1_block_data_order:
+ lda sp,-64(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ stq s2,24(sp)
+ stq s3,32(sp)
+ stq s4,40(sp)
+ stq s5,48(sp)
+ stq fp,56(sp)
+ .mask 0x0400fe00,-64
+ .frame sp,64,ra
+ .prologue 0
+
+ ldl $A,0($ctx)
+ ldl $B,4($ctx)
+ sll $num,6,$num
+ ldl $C,8($ctx)
+ ldl $D,12($ctx)
+ ldl $E,16($ctx)
+ addq $inp,$num,$num
+
+.Lloop:
+ .set noreorder
+ ldah $K,23170(zero)
+ zapnot $B,0xf,$B
+ lda $K,31129($K) # K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,28378(zero)
+ lda $K,-5215($K) # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,-28900(zero)
+ lda $K,-17188($K) # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,-13725(zero)
+ lda $K,-15914($K) # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ addl @X[0],$A,$A
+ addl @X[1],$B,$B
+ addl @X[2],$C,$C
+ addl @X[3],$D,$D
+ addl @X[4],$E,$E
+ stl $A,0($ctx)
+ stl $B,4($ctx)
+ addq $inp,64,$inp
+ stl $C,8($ctx)
+ stl $D,12($ctx)
+ stl $E,16($ctx)
+ cmpult $inp,$num,$t1
+ bne $t1,.Lloop
+
+ .set noreorder
+ ldq ra,0(sp)
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ ldq s2,24(sp)
+ ldq s3,32(sp)
+ ldq s4,40(sp)
+ ldq s5,48(sp)
+ ldq fp,56(sp)
+ lda sp,64(sp)
+ ret (ra)
+.end sha1_block_data_order
+.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
index 79e3f613..33da3e0e 100644
--- a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
+++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
@@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in a
+# 10% improvement on the Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
- eor $t2,$t2,$t3
+ eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
+ str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
- str $t0,[$Xi,#-4]!
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
- ldrb $t0,[$inp],#4
- ldrb $t1,[$inp,#-1]
- ldrb $t2,[$inp,#-2]
+#if __ARM_ARCH__<7
+ ldrb $t1,[$inp,#2]
+ ldrb $t0,[$inp,#3]
+ ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
- ldrb $t3,[$inp,#-3]
+ ldrb $t3,[$inp],#4
+ orr $t0,$t0,$t1,lsl#8
+ eor $t1,$c,$d @ F_xx_xx
+ orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
- orr $t0,$t1,$t0,lsl#24
+ orr $t0,$t0,$t3,lsl#24
+#else
+ ldr $t0,[$inp],#4 @ handles unaligned
+ add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
- orr $t0,$t0,$t2,lsl#8
- orr $t0,$t0,$t3,lsl#16
+ add $e,$e,$a,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev $t0,$t0 @ byte swap
+#endif
+#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
@@ -136,6 +150,8 @@ ___
}
$code=<<___;
+#include "arm_arch.h"
+
.text
.global sha1_block_data_order
@@ -161,7 +177,7 @@ for($i=0;$i<5;$i++) {
$code.=<<___;
teq $Xi,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#5*4
+ sub sp,sp,#25*4
___
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
@@ -171,7 +187,6 @@ ___
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
- sub sp,sp,#20*4
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
@@ -210,10 +225,14 @@ $code.=<<___;
teq $inp,$len
bne .Lloop @ [+18], total 1307
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.s b/main/openssl/crypto/sha/asm/sha1-armv4-large.s
index 7f687d9f..639ae78a 100644
--- a/main/openssl/crypto/sha/asm/sha1-armv4-large.s
+++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.s
@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
.text
.global sha1_block_data_order
@@ -16,76 +18,126 @@ sha1_block_data_order:
mov r6,r6,ror#30
mov r7,r7,ror#30 @ [6]
.L_00_15:
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r3,r10,ror#2
add r6,r6,r9 @ E+=X[i]
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r7,r10,ror#2
add r5,r5,r9 @ E+=X[i]
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r6,r10,ror#2
add r4,r4,r9 @ E+=X[i]
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r5,r10,ror#2
add r3,r3,r9 @ E+=X[i]
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
@@ -93,17 +145,27 @@ sha1_block_data_order:
add r3,r3,r10 @ E+=F_00_19(B,C,D)
teq r14,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#5*4
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
@@ -115,15 +177,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
add r6,r6,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -132,15 +194,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
add r5,r5,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -149,15 +211,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
add r4,r4,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -166,20 +228,19 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
add r3,r3,r10 @ E+=F_00_19(B,C,D)
ldr r8,.LK_20_39 @ [+15+16*4]
- sub sp,sp,#20*4
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
ldr r9,[r14,#15*4]
@@ -188,15 +249,15 @@ sha1_block_data_order:
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r4,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r7,r7,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -204,15 +265,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -220,15 +281,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -236,15 +297,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -252,15 +313,15 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_20_39(B,C,D)
teq r14,sp @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
@@ -275,15 +336,15 @@ sha1_block_data_order:
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r4,r10,ror#2 @ F_xx_xx
and r11,r5,r6 @ F_xx_xx
add r7,r7,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_40_59(B,C,D)
add r7,r7,r11,ror#2
ldr r9,[r14,#15*4]
@@ -292,15 +353,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
and r11,r4,r5 @ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_40_59(B,C,D)
add r6,r6,r11,ror#2
ldr r9,[r14,#15*4]
@@ -309,15 +370,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
and r11,r3,r4 @ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_40_59(B,C,D)
add r5,r5,r11,ror#2
ldr r9,[r14,#15*4]
@@ -326,15 +387,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
and r11,r7,r3 @ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_40_59(B,C,D)
add r4,r4,r11,ror#2
ldr r9,[r14,#15*4]
@@ -343,15 +404,15 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
and r11,r6,r7 @ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_40_59(B,C,D)
add r3,r3,r11,ror#2
teq r14,sp
@@ -373,10 +434,14 @@ sha1_block_data_order:
teq r1,r2
bne .Lloop @ [+18], total 1307
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
diff --git a/main/openssl/crypto/sha/asm/sha1-ia64.pl b/main/openssl/crypto/sha/asm/sha1-ia64.pl
index 51c4f47e..02d35d16 100644
--- a/main/openssl/crypto/sha/asm/sha1-ia64.pl
+++ b/main/openssl/crypto/sha/asm/sha1-ia64.pl
@@ -15,7 +15,7 @@
# is >50% better than HP C and >2x better than gcc.
$code=<<___;
-.ident \"sha1-ia64.s, version 1.2\"
+.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
-for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
- $big_endian=0 if (/\-DL_ENDIAN/); }
-if (!defined($big_endian))
- { $big_endian=(unpack('L',pack('N',1))==1); }
#$human=1;
if ($human) { # useful for visual code auditing...
- ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T");
+ ($A,$B,$C,$D,$E) = ("A","B","C","D","E");
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "K_00_19","K_20_39","K_40_59","K_60_79" );
@@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing...
"X8", "X9","X10","X11","X12","X13","X14","X15" );
}
else {
- ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5");
- ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
+ ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
+ ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
- ( "r14", "r15", "loc11", "loc12" );
+ ( "r14", "r15", "loc10", "loc11" );
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
}
sub BODY_00_15 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___ if ($i==0);
-{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB
+{ .mmi; ld1 $X[$i]=[inp],2 // MSB
ld1 tmp2=[tmp3],2 };;
{ .mmi; ld1 tmp0=[inp],2
ld1 tmp4=[tmp3],2 // LSB
- dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };;
+ dep $X[$i]=$X[$i],tmp2,8,8 };;
___
if ($i<15) {
$code.=<<___;
-{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1
+{ .mmi; ld1 $Xn=[inp],2 // forward Xload
+ nop.m 0x0
dep tmp1=tmp0,tmp4,8,8 };;
-{ .mmi; ld1 tmp2=[tmp3],2 // +1
+{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
and tmp4=$c,$b
- dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
-{ .mmi; andcm tmp1=$d,$b
- add tmp0=$e,$K_00_19
+ dep $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
+ andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
-{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
+{ .mmi; add $e=$e,$X[$i] // e+=Xload
+ or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 };; // a>>27
-{ .mmi; ld1 tmp0=[inp],2 // +1
- add $f=$f,tmp4 // f+=F_00_19(b,c,d)
+{ .mmi; ld1 tmp0=[inp],2 // forward Xload
+ add $e=$e,tmp4 // e+=F_00_19(b,c,d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; ld1 tmp4=[tmp3],2 // +1
+{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
or tmp5=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5)
- dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;;
+{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
+ dep $Xn=$Xn,tmp2,8,8 // forward Xload
+ mux2 $X[$i]=$X[$i],0x44 } //;;
___
}
@@ -89,24 +88,24 @@ else {
$code.=<<___;
{ .mii; and tmp3=$c,$b
dep tmp1=tmp0,tmp4,8,8;;
- dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
-{ .mmi; andcm tmp1=$d,$b
- add tmp0=$e,$K_00_19
+ dep $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
+ andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
-{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
+{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
+ or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
-{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
+ xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };;
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+ mux2 $X[$i]=$X[$i],0x44 };;
___
}
@@ -114,27 +113,28 @@ ___
sub BODY_16_19 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
+{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; andcm tmp1=$d,$b
- add tmp4=$e,$K_00_19 };;
-{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=$f,tmp4 // f+=e+K_00_19
+{ .mib; andcm tmp1=$d,$b
+ and tmp0=$c,$b };;
+{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
+ or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; add $e=$e,tmp0 // e+=F_00_19(b,c,d)
+ xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
@@ -142,49 +142,47 @@ ___
sub BODY_20_39 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
+my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
$Konst = $K_20_39 if (!defined($Konst));
+my $j=$i+1;
+my $Xn=@X[$j%16];
if ($i<79) {
$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
+{ .mib; add $e=$e,$Konst // e+=K_XX_XX
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- add $f=$f,tmp4 // f+=e+K_20_39
+ xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
+{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
+ xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
+{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
+ xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
else {
$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
+{ .mib; add $e=$e,$Konst // e+=K_60_79
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
add $h1=$h1,$a };; // wrap up
-{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;?
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
+{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
+ extr.u tmp1=$a,27,5 } // a>>27
+{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
add $h3=$h3,$c };; // wrap up
-{ .mib; add tmp3=1,inp // used in unaligned codepath
- add $f=$f,tmp1 } // f+=ROTATE(a,5)
-{ .mib; add $h2=$h2,$b // wrap up
+{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
+ or tmp1=tmp1,tmp5 // ROTATE(a,5)
+ shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
+{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ add tmp3=1,inp // used in unaligned codepath
add $h4=$h4,$d };; // wrap up
___
@@ -193,29 +191,29 @@ ___
sub BODY_40_59 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
+{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; and tmp1=$d,$b
- add tmp4=$e,$K_40_59 };;
-{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d)
- add $f=$f,tmp4 // f+=e+K_40_59
+{ .mib; and tmp1=$c,$d
+ xor tmp0=$c,$d };;
+{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
+ add tmp5=tmp5,tmp1 // a<<5+(c&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; and tmp4=$c,$d
- xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- };;
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; and tmp0=tmp0,$b
+ xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
+{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
+ add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
+{ .mmi; xor $Xn=$Xn,tmp3
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d)
- shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- add $f=$f,tmp1 };; // f+=ROTATE(a,5)
+{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+ nop.i 0x0 };;
___
}
@@ -237,7 +235,7 @@ inp=r33; // in1
.align 32
sha1_block_data_order:
.prologue
-{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
+{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
$ADDP tmp0=4,ctx
.save ar.lc,r3
mov r3=ar.lc }
@@ -245,8 +243,8 @@ sha1_block_data_order:
$ADDP inp=0,inp
mov r2=pr };;
tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
+tmp5=loc12;
+tmp6=loc13;
.body
{ .mlx; ld4 $h0=[ctx],8
movl $K_00_19=0x5a827999 }
@@ -273,7 +271,8 @@ tmp6=loc14;
___
-{ my $i,@V=($A,$B,$C,$D,$E,$T);
+{ my $i;
+ my @V=($A,$B,$C,$D,$E);
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +280,12 @@ ___
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
- (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
+ (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
}
$code.=<<___;
-{ .mmb; add $h0=$h0,$E
- nop.m 0
+{ .mmb; add $h0=$h0,$A
+ add $h2=$h2,$C
br.ctop.dptk.many .Ldtop };;
.Ldend:
{ .mmi; add tmp0=4,ctx
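
The driver loop just above emits each round body with `&BODY_xx(\$code,$i,@V)` and then rotates the register roles with `unshift(@V,pop(@V))` instead of moving data between registers. Since 80 rounds are exactly 16 full rotations of a 5-element list, the array ends in its starting order, which is what the updated assertion `(($V[0] eq $A) and ($V[4] eq $E)) or die` verifies now that the sixth variable $T is gone. A minimal sketch of the mechanism:

use strict; use warnings;
my @V = qw(A B C D E);          # the five working variables a..e
for my $i (0 .. 79) {
    # a round body would be emitted here with roles (a,b,c,d,e) = @V;
    # rotating the list renames the registers instead of moving data:
    unshift(@V, pop(@V));       # (a,b,c,d,e) <- (e,a,b,c,d)
}
# 80 rounds == 16 full rotations of a 5-element list
die "rotation broken" unless $V[0] eq 'A' and $V[4] eq 'E';
print "@V\n";                   # prints: A B C D E
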
diff --git a/main/openssl/crypto/sha/asm/sha1-mips.S b/main/openssl/crypto/sha/asm/sha1-mips.S
new file mode 100644
index 00000000..865da255
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-mips.S
@@ -0,0 +1,1664 @@
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.align 5
+.globl sha1_block_data_order
+.ent sha1_block_data_order
+sha1_block_data_order:
+ .frame $29,16*4,$31
+ .mask 3237937152,-4
+ .set noreorder
+ sub $29,16*4
+ sw $31,(16-1)*4($29)
+ sw $30,(16-2)*4($29)
+ sw $23,(16-3)*4($29)
+ sw $22,(16-4)*4($29)
+ sw $21,(16-5)*4($29)
+ sw $20,(16-6)*4($29)
+ sw $19,(16-7)*4($29)
+ sw $18,(16-8)*4($29)
+ sw $17,(16-9)*4($29)
+ sw $16,(16-10)*4($29)
+ sll $6,6
+ add $6,$5
+ sw $6,0($29)
+ lw $1,0($4)
+ lw $2,4($4)
+ lw $3,8($4)
+ lw $7,12($4)
+ b .Loop
+ lw $24,16($4)
+.align 4
+.Loop:
+ .set reorder
+ lwl $8,3($5)
+ lui $31,0x5a82
+ lwr $8,0($5)
+ ori $31,0x7999 # K_00_19
+ srl $25,$8,24 # byte swap(0)
+ srl $6,$8,8
+ andi $30,$8,0xFF00
+ sll $8,$8,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $8,$25
+ or $6,$30
+ or $8,$6
+ lwl $9,1*4+3($5)
+ sll $25,$1,5 # 0
+ addu $24,$31
+ lwr $9,1*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$8
+ or $2,$30
+ addu $24,$25
+ srl $25,$9,24 # byte swap(1)
+ srl $6,$9,8
+ andi $30,$9,0xFF00
+ sll $9,$9,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $9,$25
+ or $6,$30
+ or $9,$6
+ lwl $10,2*4+3($5)
+ sll $25,$24,5 # 1
+ addu $7,$31
+ lwr $10,2*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$9
+ or $1,$30
+ addu $7,$25
+ srl $25,$10,24 # byte swap(2)
+ srl $6,$10,8
+ andi $30,$10,0xFF00
+ sll $10,$10,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $10,$25
+ or $6,$30
+ or $10,$6
+ lwl $11,3*4+3($5)
+ sll $25,$7,5 # 2
+ addu $3,$31
+ lwr $11,3*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$10
+ or $24,$30
+ addu $3,$25
+ srl $25,$11,24 # byte swap(3)
+ srl $6,$11,8
+ andi $30,$11,0xFF00
+ sll $11,$11,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $11,$25
+ or $6,$30
+ or $11,$6
+ lwl $12,4*4+3($5)
+ sll $25,$3,5 # 3
+ addu $2,$31
+ lwr $12,4*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$11
+ or $7,$30
+ addu $2,$25
+ srl $25,$12,24 # byte swap(4)
+ srl $6,$12,8
+ andi $30,$12,0xFF00
+ sll $12,$12,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $12,$25
+ or $6,$30
+ or $12,$6
+ lwl $13,5*4+3($5)
+ sll $25,$2,5 # 4
+ addu $1,$31
+ lwr $13,5*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$12
+ or $3,$30
+ addu $1,$25
+ srl $25,$13,24 # byte swap(5)
+ srl $6,$13,8
+ andi $30,$13,0xFF00
+ sll $13,$13,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $13,$25
+ or $6,$30
+ or $13,$6
+ lwl $14,6*4+3($5)
+ sll $25,$1,5 # 5
+ addu $24,$31
+ lwr $14,6*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$13
+ or $2,$30
+ addu $24,$25
+ srl $25,$14,24 # byte swap(6)
+ srl $6,$14,8
+ andi $30,$14,0xFF00
+ sll $14,$14,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $14,$25
+ or $6,$30
+ or $14,$6
+ lwl $15,7*4+3($5)
+ sll $25,$24,5 # 6
+ addu $7,$31
+ lwr $15,7*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$14
+ or $1,$30
+ addu $7,$25
+ srl $25,$15,24 # byte swap(7)
+ srl $6,$15,8
+ andi $30,$15,0xFF00
+ sll $15,$15,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $15,$25
+ or $6,$30
+ or $15,$6
+ lwl $16,8*4+3($5)
+ sll $25,$7,5 # 7
+ addu $3,$31
+ lwr $16,8*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$15
+ or $24,$30
+ addu $3,$25
+ srl $25,$16,24 # byte swap(8)
+ srl $6,$16,8
+ andi $30,$16,0xFF00
+ sll $16,$16,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $16,$25
+ or $6,$30
+ or $16,$6
+ lwl $17,9*4+3($5)
+ sll $25,$3,5 # 8
+ addu $2,$31
+ lwr $17,9*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$16
+ or $7,$30
+ addu $2,$25
+ srl $25,$17,24 # byte swap(9)
+ srl $6,$17,8
+ andi $30,$17,0xFF00
+ sll $17,$17,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $17,$25
+ or $6,$30
+ or $17,$6
+ lwl $18,10*4+3($5)
+ sll $25,$2,5 # 9
+ addu $1,$31
+ lwr $18,10*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$17
+ or $3,$30
+ addu $1,$25
+ srl $25,$18,24 # byte swap(10)
+ srl $6,$18,8
+ andi $30,$18,0xFF00
+ sll $18,$18,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $18,$25
+ or $6,$30
+ or $18,$6
+ lwl $19,11*4+3($5)
+ sll $25,$1,5 # 10
+ addu $24,$31
+ lwr $19,11*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$18
+ or $2,$30
+ addu $24,$25
+ srl $25,$19,24 # byte swap(11)
+ srl $6,$19,8
+ andi $30,$19,0xFF00
+ sll $19,$19,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $19,$25
+ or $6,$30
+ or $19,$6
+ lwl $20,12*4+3($5)
+ sll $25,$24,5 # 11
+ addu $7,$31
+ lwr $20,12*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$19
+ or $1,$30
+ addu $7,$25
+ srl $25,$20,24 # byte swap(12)
+ srl $6,$20,8
+ andi $30,$20,0xFF00
+ sll $20,$20,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $20,$25
+ or $6,$30
+ or $20,$6
+ lwl $21,13*4+3($5)
+ sll $25,$7,5 # 12
+ addu $3,$31
+ lwr $21,13*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$20
+ or $24,$30
+ addu $3,$25
+ srl $25,$21,24 # byte swap(13)
+ srl $6,$21,8
+ andi $30,$21,0xFF00
+ sll $21,$21,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $21,$25
+ or $6,$30
+ or $21,$6
+ lwl $22,14*4+3($5)
+ sll $25,$3,5 # 13
+ addu $2,$31
+ lwr $22,14*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$21
+ or $7,$30
+ addu $2,$25
+ srl $25,$22,24 # byte swap(14)
+ srl $6,$22,8
+ andi $30,$22,0xFF00
+ sll $22,$22,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $22,$25
+ or $6,$30
+ or $22,$6
+ lwl $23,15*4+3($5)
+ sll $25,$2,5 # 14
+ addu $1,$31
+ lwr $23,15*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$22
+ or $3,$30
+ addu $1,$25
+ srl $25,$23,24 # byte swap(15)
+ srl $6,$23,8
+ andi $30,$23,0xFF00
+ sll $23,$23,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $23,$25
+ or $23,$6
+ or $23,$30
+ xor $8,$10
+ sll $25,$1,5 # 15
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $8,$16
+ xor $25,$3,$7
+ addu $24,$6
+ xor $8,$21
+ sll $30,$2,30
+ and $25,$2
+ srl $6,$8,31
+ addu $8,$8
+ srl $2,$2,2
+ xor $25,$7
+ or $8,$6
+ addu $24,$23
+ or $2,$30
+ addu $24,$25
+ xor $9,$11
+ sll $25,$24,5 # 16
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $9,$17
+ xor $25,$2,$3
+ addu $7,$6
+ xor $9,$22
+ sll $30,$1,30
+ and $25,$1
+ srl $6,$9,31
+ addu $9,$9
+ srl $1,$1,2
+ xor $25,$3
+ or $9,$6
+ addu $7,$8
+ or $1,$30
+ addu $7,$25
+ xor $10,$12
+ sll $25,$7,5 # 17
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $10,$18
+ xor $25,$1,$2
+ addu $3,$6
+ xor $10,$23
+ sll $30,$24,30
+ and $25,$24
+ srl $6,$10,31
+ addu $10,$10
+ srl $24,$24,2
+ xor $25,$2
+ or $10,$6
+ addu $3,$9
+ or $24,$30
+ addu $3,$25
+ xor $11,$13
+ sll $25,$3,5 # 18
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $11,$19
+ xor $25,$24,$1
+ addu $2,$6
+ xor $11,$8
+ sll $30,$7,30
+ and $25,$7
+ srl $6,$11,31
+ addu $11,$11
+ srl $7,$7,2
+ xor $25,$1
+ or $11,$6
+ addu $2,$10
+ or $7,$30
+ addu $2,$25
+ xor $12,$14
+ sll $25,$2,5 # 19
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $12,$20
+ xor $25,$7,$24
+ addu $1,$6
+ xor $12,$9
+ sll $30,$3,30
+ and $25,$3
+ srl $6,$12,31
+ addu $12,$12
+ srl $3,$3,2
+ xor $25,$24
+ or $12,$6
+ addu $1,$11
+ or $3,$30
+ addu $1,$25
+ lui $31,0x6ed9
+ ori $31,0xeba1 # K_20_39
+ xor $13,$15
+ sll $25,$1,5 # 20
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $13,$21
+ xor $25,$3,$7
+ addu $24,$6
+ xor $13,$10
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$13,31
+ addu $13,$13
+ srl $2,$2,2
+ addu $24,$12
+ or $13,$6
+ or $2,$30
+ addu $24,$25
+ xor $14,$16
+ sll $25,$24,5 # 21
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $14,$22
+ xor $25,$2,$3
+ addu $7,$6
+ xor $14,$11
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$14,31
+ addu $14,$14
+ srl $1,$1,2
+ addu $7,$13
+ or $14,$6
+ or $1,$30
+ addu $7,$25
+ xor $15,$17
+ sll $25,$7,5 # 22
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $15,$23
+ xor $25,$1,$2
+ addu $3,$6
+ xor $15,$12
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$15,31
+ addu $15,$15
+ srl $24,$24,2
+ addu $3,$14
+ or $15,$6
+ or $24,$30
+ addu $3,$25
+ xor $16,$18
+ sll $25,$3,5 # 23
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $16,$8
+ xor $25,$24,$1
+ addu $2,$6
+ xor $16,$13
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$16,31
+ addu $16,$16
+ srl $7,$7,2
+ addu $2,$15
+ or $16,$6
+ or $7,$30
+ addu $2,$25
+ xor $17,$19
+ sll $25,$2,5 # 24
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $17,$9
+ xor $25,$7,$24
+ addu $1,$6
+ xor $17,$14
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$17,31
+ addu $17,$17
+ srl $3,$3,2
+ addu $1,$16
+ or $17,$6
+ or $3,$30
+ addu $1,$25
+ xor $18,$20
+ sll $25,$1,5 # 25
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $18,$10
+ xor $25,$3,$7
+ addu $24,$6
+ xor $18,$15
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$18,31
+ addu $18,$18
+ srl $2,$2,2
+ addu $24,$17
+ or $18,$6
+ or $2,$30
+ addu $24,$25
+ xor $19,$21
+ sll $25,$24,5 # 26
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $19,$11
+ xor $25,$2,$3
+ addu $7,$6
+ xor $19,$16
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$19,31
+ addu $19,$19
+ srl $1,$1,2
+ addu $7,$18
+ or $19,$6
+ or $1,$30
+ addu $7,$25
+ xor $20,$22
+ sll $25,$7,5 # 27
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $20,$12
+ xor $25,$1,$2
+ addu $3,$6
+ xor $20,$17
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$20,31
+ addu $20,$20
+ srl $24,$24,2
+ addu $3,$19
+ or $20,$6
+ or $24,$30
+ addu $3,$25
+ xor $21,$23
+ sll $25,$3,5 # 28
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $21,$13
+ xor $25,$24,$1
+ addu $2,$6
+ xor $21,$18
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$21,31
+ addu $21,$21
+ srl $7,$7,2
+ addu $2,$20
+ or $21,$6
+ or $7,$30
+ addu $2,$25
+ xor $22,$8
+ sll $25,$2,5 # 29
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $22,$14
+ xor $25,$7,$24
+ addu $1,$6
+ xor $22,$19
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$22,31
+ addu $22,$22
+ srl $3,$3,2
+ addu $1,$21
+ or $22,$6
+ or $3,$30
+ addu $1,$25
+ xor $23,$9
+ sll $25,$1,5 # 30
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $23,$15
+ xor $25,$3,$7
+ addu $24,$6
+ xor $23,$20
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$23,31
+ addu $23,$23
+ srl $2,$2,2
+ addu $24,$22
+ or $23,$6
+ or $2,$30
+ addu $24,$25
+ xor $8,$10
+ sll $25,$24,5 # 31
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $8,$16
+ xor $25,$2,$3
+ addu $7,$6
+ xor $8,$21
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$8,31
+ addu $8,$8
+ srl $1,$1,2
+ addu $7,$23
+ or $8,$6
+ or $1,$30
+ addu $7,$25
+ xor $9,$11
+ sll $25,$7,5 # 32
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $9,$17
+ xor $25,$1,$2
+ addu $3,$6
+ xor $9,$22
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$9,31
+ addu $9,$9
+ srl $24,$24,2
+ addu $3,$8
+ or $9,$6
+ or $24,$30
+ addu $3,$25
+ xor $10,$12
+ sll $25,$3,5 # 33
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $10,$18
+ xor $25,$24,$1
+ addu $2,$6
+ xor $10,$23
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$10,31
+ addu $10,$10
+ srl $7,$7,2
+ addu $2,$9
+ or $10,$6
+ or $7,$30
+ addu $2,$25
+ xor $11,$13
+ sll $25,$2,5 # 34
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $11,$19
+ xor $25,$7,$24
+ addu $1,$6
+ xor $11,$8
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$11,31
+ addu $11,$11
+ srl $3,$3,2
+ addu $1,$10
+ or $11,$6
+ or $3,$30
+ addu $1,$25
+ xor $12,$14
+ sll $25,$1,5 # 35
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $12,$20
+ xor $25,$3,$7
+ addu $24,$6
+ xor $12,$9
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$12,31
+ addu $12,$12
+ srl $2,$2,2
+ addu $24,$11
+ or $12,$6
+ or $2,$30
+ addu $24,$25
+ xor $13,$15
+ sll $25,$24,5 # 36
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $13,$21
+ xor $25,$2,$3
+ addu $7,$6
+ xor $13,$10
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$13,31
+ addu $13,$13
+ srl $1,$1,2
+ addu $7,$12
+ or $13,$6
+ or $1,$30
+ addu $7,$25
+ xor $14,$16
+ sll $25,$7,5 # 37
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $14,$22
+ xor $25,$1,$2
+ addu $3,$6
+ xor $14,$11
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$14,31
+ addu $14,$14
+ srl $24,$24,2
+ addu $3,$13
+ or $14,$6
+ or $24,$30
+ addu $3,$25
+ xor $15,$17
+ sll $25,$3,5 # 38
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $15,$23
+ xor $25,$24,$1
+ addu $2,$6
+ xor $15,$12
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$15,31
+ addu $15,$15
+ srl $7,$7,2
+ addu $2,$14
+ or $15,$6
+ or $7,$30
+ addu $2,$25
+ xor $16,$18
+ sll $25,$2,5 # 39
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $16,$8
+ xor $25,$7,$24
+ addu $1,$6
+ xor $16,$13
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$16,31
+ addu $16,$16
+ srl $3,$3,2
+ addu $1,$15
+ or $16,$6
+ or $3,$30
+ addu $1,$25
+ lui $31,0x8f1b
+ ori $31,0xbcdc # K_40_59
+ xor $17,$19
+ sll $25,$1,5 # 40
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $17,$9
+ and $25,$3,$7
+ addu $24,$6
+ xor $17,$14
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$17,31
+ xor $25,$3,$7
+ addu $17,$17
+ and $25,$2
+ srl $2,$2,2
+ or $17,$6
+ addu $24,$16
+ or $2,$30
+ addu $24,$25
+ xor $18,$20
+ sll $25,$24,5 # 41
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $18,$10
+ and $25,$2,$3
+ addu $7,$6
+ xor $18,$15
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$18,31
+ xor $25,$2,$3
+ addu $18,$18
+ and $25,$1
+ srl $1,$1,2
+ or $18,$6
+ addu $7,$17
+ or $1,$30
+ addu $7,$25
+ xor $19,$21
+ sll $25,$7,5 # 42
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $19,$11
+ and $25,$1,$2
+ addu $3,$6
+ xor $19,$16
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$19,31
+ xor $25,$1,$2
+ addu $19,$19
+ and $25,$24
+ srl $24,$24,2
+ or $19,$6
+ addu $3,$18
+ or $24,$30
+ addu $3,$25
+ xor $20,$22
+ sll $25,$3,5 # 43
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $20,$12
+ and $25,$24,$1
+ addu $2,$6
+ xor $20,$17
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$20,31
+ xor $25,$24,$1
+ addu $20,$20
+ and $25,$7
+ srl $7,$7,2
+ or $20,$6
+ addu $2,$19
+ or $7,$30
+ addu $2,$25
+ xor $21,$23
+ sll $25,$2,5 # 44
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $21,$13
+ and $25,$7,$24
+ addu $1,$6
+ xor $21,$18
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$21,31
+ xor $25,$7,$24
+ addu $21,$21
+ and $25,$3
+ srl $3,$3,2
+ or $21,$6
+ addu $1,$20
+ or $3,$30
+ addu $1,$25
+ xor $22,$8
+ sll $25,$1,5 # 45
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $22,$14
+ and $25,$3,$7
+ addu $24,$6
+ xor $22,$19
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$22,31
+ xor $25,$3,$7
+ addu $22,$22
+ and $25,$2
+ srl $2,$2,2
+ or $22,$6
+ addu $24,$21
+ or $2,$30
+ addu $24,$25
+ xor $23,$9
+ sll $25,$24,5 # 46
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $23,$15
+ and $25,$2,$3
+ addu $7,$6
+ xor $23,$20
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$23,31
+ xor $25,$2,$3
+ addu $23,$23
+ and $25,$1
+ srl $1,$1,2
+ or $23,$6
+ addu $7,$22
+ or $1,$30
+ addu $7,$25
+ xor $8,$10
+ sll $25,$7,5 # 47
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $8,$16
+ and $25,$1,$2
+ addu $3,$6
+ xor $8,$21
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$8,31
+ xor $25,$1,$2
+ addu $8,$8
+ and $25,$24
+ srl $24,$24,2
+ or $8,$6
+ addu $3,$23
+ or $24,$30
+ addu $3,$25
+ xor $9,$11
+ sll $25,$3,5 # 48
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $9,$17
+ and $25,$24,$1
+ addu $2,$6
+ xor $9,$22
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$9,31
+ xor $25,$24,$1
+ addu $9,$9
+ and $25,$7
+ srl $7,$7,2
+ or $9,$6
+ addu $2,$8
+ or $7,$30
+ addu $2,$25
+ xor $10,$12
+ sll $25,$2,5 # 49
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $10,$18
+ and $25,$7,$24
+ addu $1,$6
+ xor $10,$23
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$10,31
+ xor $25,$7,$24
+ addu $10,$10
+ and $25,$3
+ srl $3,$3,2
+ or $10,$6
+ addu $1,$9
+ or $3,$30
+ addu $1,$25
+ xor $11,$13
+ sll $25,$1,5 # 50
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $11,$19
+ and $25,$3,$7
+ addu $24,$6
+ xor $11,$8
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$11,31
+ xor $25,$3,$7
+ addu $11,$11
+ and $25,$2
+ srl $2,$2,2
+ or $11,$6
+ addu $24,$10
+ or $2,$30
+ addu $24,$25
+ xor $12,$14
+ sll $25,$24,5 # 51
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $12,$20
+ and $25,$2,$3
+ addu $7,$6
+ xor $12,$9
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$12,31
+ xor $25,$2,$3
+ addu $12,$12
+ and $25,$1
+ srl $1,$1,2
+ or $12,$6
+ addu $7,$11
+ or $1,$30
+ addu $7,$25
+ xor $13,$15
+ sll $25,$7,5 # 52
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $13,$21
+ and $25,$1,$2
+ addu $3,$6
+ xor $13,$10
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$13,31
+ xor $25,$1,$2
+ addu $13,$13
+ and $25,$24
+ srl $24,$24,2
+ or $13,$6
+ addu $3,$12
+ or $24,$30
+ addu $3,$25
+ xor $14,$16
+ sll $25,$3,5 # 53
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $14,$22
+ and $25,$24,$1
+ addu $2,$6
+ xor $14,$11
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$14,31
+ xor $25,$24,$1
+ addu $14,$14
+ and $25,$7
+ srl $7,$7,2
+ or $14,$6
+ addu $2,$13
+ or $7,$30
+ addu $2,$25
+ xor $15,$17
+ sll $25,$2,5 # 54
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $15,$23
+ and $25,$7,$24
+ addu $1,$6
+ xor $15,$12
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$15,31
+ xor $25,$7,$24
+ addu $15,$15
+ and $25,$3
+ srl $3,$3,2
+ or $15,$6
+ addu $1,$14
+ or $3,$30
+ addu $1,$25
+ xor $16,$18
+ sll $25,$1,5 # 55
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $16,$8
+ and $25,$3,$7
+ addu $24,$6
+ xor $16,$13
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$16,31
+ xor $25,$3,$7
+ addu $16,$16
+ and $25,$2
+ srl $2,$2,2
+ or $16,$6
+ addu $24,$15
+ or $2,$30
+ addu $24,$25
+ xor $17,$19
+ sll $25,$24,5 # 56
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $17,$9
+ and $25,$2,$3
+ addu $7,$6
+ xor $17,$14
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$17,31
+ xor $25,$2,$3
+ addu $17,$17
+ and $25,$1
+ srl $1,$1,2
+ or $17,$6
+ addu $7,$16
+ or $1,$30
+ addu $7,$25
+ xor $18,$20
+ sll $25,$7,5 # 57
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $18,$10
+ and $25,$1,$2
+ addu $3,$6
+ xor $18,$15
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$18,31
+ xor $25,$1,$2
+ addu $18,$18
+ and $25,$24
+ srl $24,$24,2
+ or $18,$6
+ addu $3,$17
+ or $24,$30
+ addu $3,$25
+ xor $19,$21
+ sll $25,$3,5 # 58
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $19,$11
+ and $25,$24,$1
+ addu $2,$6
+ xor $19,$16
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$19,31
+ xor $25,$24,$1
+ addu $19,$19
+ and $25,$7
+ srl $7,$7,2
+ or $19,$6
+ addu $2,$18
+ or $7,$30
+ addu $2,$25
+ xor $20,$22
+ sll $25,$2,5 # 59
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $20,$12
+ and $25,$7,$24
+ addu $1,$6
+ xor $20,$17
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$20,31
+ xor $25,$7,$24
+ addu $20,$20
+ and $25,$3
+ srl $3,$3,2
+ or $20,$6
+ addu $1,$19
+ or $3,$30
+ addu $1,$25
+ lui $31,0xca62
+ ori $31,0xc1d6 # K_60_79
+ xor $21,$23
+ sll $25,$1,5 # 60
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $21,$13
+ xor $25,$3,$7
+ addu $24,$6
+ xor $21,$18
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$21,31
+ addu $21,$21
+ srl $2,$2,2
+ addu $24,$20
+ or $21,$6
+ or $2,$30
+ addu $24,$25
+ xor $22,$8
+ sll $25,$24,5 # 61
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $22,$14
+ xor $25,$2,$3
+ addu $7,$6
+ xor $22,$19
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$22,31
+ addu $22,$22
+ srl $1,$1,2
+ addu $7,$21
+ or $22,$6
+ or $1,$30
+ addu $7,$25
+ xor $23,$9
+ sll $25,$7,5 # 62
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $23,$15
+ xor $25,$1,$2
+ addu $3,$6
+ xor $23,$20
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$23,31
+ addu $23,$23
+ srl $24,$24,2
+ addu $3,$22
+ or $23,$6
+ or $24,$30
+ addu $3,$25
+ xor $8,$10
+ sll $25,$3,5 # 63
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $8,$16
+ xor $25,$24,$1
+ addu $2,$6
+ xor $8,$21
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$8,31
+ addu $8,$8
+ srl $7,$7,2
+ addu $2,$23
+ or $8,$6
+ or $7,$30
+ addu $2,$25
+ xor $9,$11
+ sll $25,$2,5 # 64
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $9,$17
+ xor $25,$7,$24
+ addu $1,$6
+ xor $9,$22
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$9,31
+ addu $9,$9
+ srl $3,$3,2
+ addu $1,$8
+ or $9,$6
+ or $3,$30
+ addu $1,$25
+ xor $10,$12
+ sll $25,$1,5 # 65
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $10,$18
+ xor $25,$3,$7
+ addu $24,$6
+ xor $10,$23
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$10,31
+ addu $10,$10
+ srl $2,$2,2
+ addu $24,$9
+ or $10,$6
+ or $2,$30
+ addu $24,$25
+ xor $11,$13
+ sll $25,$24,5 # 66
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $11,$19
+ xor $25,$2,$3
+ addu $7,$6
+ xor $11,$8
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$11,31
+ addu $11,$11
+ srl $1,$1,2
+ addu $7,$10
+ or $11,$6
+ or $1,$30
+ addu $7,$25
+ xor $12,$14
+ sll $25,$7,5 # 67
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $12,$20
+ xor $25,$1,$2
+ addu $3,$6
+ xor $12,$9
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$12,31
+ addu $12,$12
+ srl $24,$24,2
+ addu $3,$11
+ or $12,$6
+ or $24,$30
+ addu $3,$25
+ xor $13,$15
+ sll $25,$3,5 # 68
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $13,$21
+ xor $25,$24,$1
+ addu $2,$6
+ xor $13,$10
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$13,31
+ addu $13,$13
+ srl $7,$7,2
+ addu $2,$12
+ or $13,$6
+ or $7,$30
+ addu $2,$25
+ xor $14,$16
+ sll $25,$2,5 # 69
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $14,$22
+ xor $25,$7,$24
+ addu $1,$6
+ xor $14,$11
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$14,31
+ addu $14,$14
+ srl $3,$3,2
+ addu $1,$13
+ or $14,$6
+ or $3,$30
+ addu $1,$25
+ xor $15,$17
+ sll $25,$1,5 # 70
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $15,$23
+ xor $25,$3,$7
+ addu $24,$6
+ xor $15,$12
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$15,31
+ addu $15,$15
+ srl $2,$2,2
+ addu $24,$14
+ or $15,$6
+ or $2,$30
+ addu $24,$25
+ xor $16,$18
+ sll $25,$24,5 # 71
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $16,$8
+ xor $25,$2,$3
+ addu $7,$6
+ xor $16,$13
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$16,31
+ addu $16,$16
+ srl $1,$1,2
+ addu $7,$15
+ or $16,$6
+ or $1,$30
+ addu $7,$25
+ xor $17,$19
+ sll $25,$7,5 # 72
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $17,$9
+ xor $25,$1,$2
+ addu $3,$6
+ xor $17,$14
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$17,31
+ addu $17,$17
+ srl $24,$24,2
+ addu $3,$16
+ or $17,$6
+ or $24,$30
+ addu $3,$25
+ xor $18,$20
+ sll $25,$3,5 # 73
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $18,$10
+ xor $25,$24,$1
+ addu $2,$6
+ xor $18,$15
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$18,31
+ addu $18,$18
+ srl $7,$7,2
+ addu $2,$17
+ or $18,$6
+ or $7,$30
+ addu $2,$25
+ xor $19,$21
+ sll $25,$2,5 # 74
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $19,$11
+ xor $25,$7,$24
+ addu $1,$6
+ xor $19,$16
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$19,31
+ addu $19,$19
+ srl $3,$3,2
+ addu $1,$18
+ or $19,$6
+ or $3,$30
+ addu $1,$25
+ xor $20,$22
+ sll $25,$1,5 # 75
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $20,$12
+ xor $25,$3,$7
+ addu $24,$6
+ xor $20,$17
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$20,31
+ addu $20,$20
+ srl $2,$2,2
+ addu $24,$19
+ or $20,$6
+ or $2,$30
+ addu $24,$25
+ xor $21,$23
+ sll $25,$24,5 # 76
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $21,$13
+ xor $25,$2,$3
+ addu $7,$6
+ xor $21,$18
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$21,31
+ addu $21,$21
+ srl $1,$1,2
+ addu $7,$20
+ or $21,$6
+ or $1,$30
+ addu $7,$25
+ xor $22,$8
+ sll $25,$7,5 # 77
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $22,$14
+ xor $25,$1,$2
+ addu $3,$6
+ xor $22,$19
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$22,31
+ addu $22,$22
+ srl $24,$24,2
+ addu $3,$21
+ or $22,$6
+ or $24,$30
+ addu $3,$25
+ xor $23,$9
+ sll $25,$3,5 # 78
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $23,$15
+ xor $25,$24,$1
+ addu $2,$6
+ xor $23,$20
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$23,31
+ addu $23,$23
+ srl $7,$7,2
+ addu $2,$22
+ or $23,$6
+ or $7,$30
+ addu $2,$25
+ lw $8,0($4)
+ sll $25,$2,5 # 79
+ addu $1,$31
+ lw $9,4($4)
+ srl $6,$2,27
+ addu $1,$25
+ lw $10,8($4)
+ xor $25,$7,$24
+ addu $1,$6
+ lw $11,12($4)
+ sll $30,$3,30
+ xor $25,$3
+ lw $12,16($4)
+ srl $3,$3,2
+ addu $1,$23
+ or $3,$30
+ addu $1,$25
+ add $5,64
+ lw $6,0($29)
+
+ addu $1,$8
+ addu $2,$9
+ sw $1,0($4)
+ addu $3,$10
+ addu $7,$11
+ sw $2,4($4)
+ addu $24,$12
+ sw $3,8($4)
+ sw $7,12($4)
+ sw $24,16($4)
+ .set noreorder
+ bne $5,$6,.Loop
+ nop
+
+ .set noreorder
+ lw $31,(16-1)*4($29)
+ lw $30,(16-2)*4($29)
+ lw $23,(16-3)*4($29)
+ lw $22,(16-4)*4($29)
+ lw $21,(16-5)*4($29)
+ lw $20,(16-6)*4($29)
+ lw $19,(16-7)*4($29)
+ lw $18,(16-8)*4($29)
+ lw $17,(16-9)*4($29)
+ lw $16,(16-10)*4($29)
+ jr $31
+ add $29,16*4
+.end sha1_block_data_order
+.rdata
+.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro@openssl.org>"
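
On little-endian targets the MIPS code above byte-swaps every input word with a fixed srl/andi/sll/or sequence (the `# byte swap(i)` blocks), since the generator deliberately stays MIPS32-compatible and uses no MIPSIII/IV-only instructions. A hedged Perl model of the same shift-and-mask sequence, checked against pack/unpack (the function name bswap32 is ours):

use strict; use warnings;
sub bswap32 {                   # mirrors the srl/andi/sll/or sequence above
    my $x  = shift;
    my $t0 =  $x >> 24;                    # srl $t0,$x,24
    my $t1 = ($x >> 8) & 0xFF00;           # srl $t1,$x,8 ; andi $t1,0xFF00
    my $t2 = ($x & 0xFF00) << 8;           # andi $t2,$x,0xFF00 ; sll $t2,$t2,8
    return (($x << 24) & 0xffffffff) | $t0 | $t1 | $t2;  # sll $x,$x,24 ; or ...
}
my $w = 0x12345678;
die "swap broken" unless bswap32($w) == unpack("V", pack("N", $w));
printf "%08x -> %08x\n", $w, bswap32($w);    # 12345678 -> 78563412

Each of the four bytes is moved to its mirror position independently and the pieces are OR-ed together, exactly as the assembly pattern does.
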
diff --git a/main/openssl/crypto/sha/asm/sha1-mips.pl b/main/openssl/crypto/sha/asm/sha1-mips.pl
new file mode 100644
index 00000000..f1a702f3
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-mips.pl
@@ -0,0 +1,354 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for MIPS.
+
+# Performance improvement is 30% on unaligned input. The "secret" is
+# to deploy the lwl/lwr pair to load unaligned input. One could have
+# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
+# compatible subroutine. There is room for minor optimization on
+# little-endian platforms...
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
+
+# offsets of the Most and Least Significant Bytes
+$MSB=$big_endian?0:3;
+$LSB=3&~$MSB;
+
+@X=map("\$$_",(8..23)); # a4-a7,s0-s11
+
+$ctx=$a0;
+$inp=$a1;
+$num=$a2;
+$A="\$1";
+$B="\$2";
+$C="\$3";
+$D="\$7";
+$E="\$24"; @V=($A,$B,$C,$D,$E);
+$t0="\$25";
+$t1=$num; # $num is offloaded to stack
+$t2="\$30"; # fp
+$K="\$31"; # ra
+
+sub BODY_00_14 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if (!$big_endian);
+ srl $t0,@X[$i],24 # byte swap($i)
+ srl $t1,@X[$i],8
+ andi $t2,@X[$i],0xFF00
+ sll @X[$i],@X[$i],24
+ andi $t1,0xFF00
+ sll $t2,$t2,8
+ or @X[$i],$t0
+ or $t1,$t2
+ or @X[$i],$t1
+___
+$code.=<<___;
+ lwl @X[$j],$j*4+$MSB($inp)
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ lwr @X[$j],$j*4+$LSB($inp)
+ srl $t1,$a,27
+ addu $e,$t0
+ xor $t0,$c,$d
+ addu $e,$t1
+ sll $t2,$b,30
+ and $t0,$b
+ srl $b,$b,2
+ xor $t0,$d
+ addu $e,@X[$i]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_15_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___ if (!$big_endian && $i==15);
+ srl $t0,@X[$i],24 # byte swap($i)
+ srl $t1,@X[$i],8
+ andi $t2,@X[$i],0xFF00
+ sll @X[$i],@X[$i],24
+ andi $t1,0xFF00
+ sll $t2,$t2,8
+ or @X[$i],$t0
+ or @X[$i],$t1
+ or @X[$i],$t2
+___
+$code.=<<___;
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ and $t0,$b
+ srl $t1,@X[$j%16],31
+ addu @X[$j%16],@X[$j%16]
+ srl $b,$b,2
+ xor $t0,$d
+ or @X[$j%16],$t1
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ xor $t0,$b
+ srl $t1,@X[$j%16],31
+ addu @X[$j%16],@X[$j%16]
+ srl $b,$b,2
+ addu $e,@X[$i%16]
+ or @X[$j%16],$t1
+ or $b,$t2
+ addu $e,$t0
+___
+$code.=<<___ if ($i==79);
+ lw @X[0],0($ctx)
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ lw @X[1],4($ctx)
+ srl $t1,$a,27
+ addu $e,$t0
+ lw @X[2],8($ctx)
+ xor $t0,$c,$d
+ addu $e,$t1
+ lw @X[3],12($ctx)
+ sll $t2,$b,30
+ xor $t0,$b
+ lw @X[4],16($ctx)
+ srl $b,$b,2
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ and $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ addu $e,$t0
+ srl $t1,@X[$j%16],31
+ xor $t0,$c,$d
+ addu @X[$j%16],@X[$j%16]
+ and $t0,$b
+ srl $b,$b,2
+ or @X[$j%16],$t1
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.align 5
+.globl sha1_block_data_order
+.ent sha1_block_data_order
+sha1_block_data_order:
+ .frame $sp,$FRAMESIZE*$SZREG,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
+ $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+ $PTR_SLL $num,6
+ $PTR_ADD $num,$inp
+ $REG_S $num,0($sp)
+ lw $A,0($ctx)
+ lw $B,4($ctx)
+ lw $C,8($ctx)
+ lw $D,12($ctx)
+ b .Loop
+ lw $E,16($ctx)
+.align 4
+.Loop:
+ .set reorder
+ lwl @X[0],$MSB($inp)
+ lui $K,0x5a82
+ lwr @X[0],$LSB($inp)
+ ori $K,0x7999 # K_00_19
+___
+for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0x6ed9
+ ori $K,0xeba1 # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0x8f1b
+ ori $K,0xbcdc # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0xca62
+ ori $K,0xc1d6 # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ $PTR_ADD $inp,64
+ $REG_L $num,0($sp)
+
+ addu $A,$X[0]
+ addu $B,$X[1]
+ sw $A,0($ctx)
+ addu $C,$X[2]
+ addu $D,$X[3]
+ sw $B,4($ctx)
+ addu $E,$X[4]
+ sw $C,8($ctx)
+ sw $D,12($ctx)
+ sw $E,16($ctx)
+ .set noreorder
+ bne $inp,$num,.Loop
+ nop
+
+ .set noreorder
+ $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
+ $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end sha1_block_data_order
+.rdata
+.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;
+close STDOUT;
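
sha1-mips.pl decides byte order at generation time: it pushes the token MIPSEL through the target compiler's preprocessor (little-endian MIPS toolchains define and thus expand it away), and keeps `(unpack('L',pack('N',1))==1)` as a fallback for when `$big_endian` is still undefined. Note that the fallback probes the build host, not the cross-compile target. A tiny standalone demo of the pack/unpack probe:

use strict; use warnings;
# pack('N',1) stores the value 1 in big-endian byte order; unpacking it
# with the native-order template 'L' reads 1 back only on a big-endian host.
my $big_endian = (unpack('L', pack('N', 1)) == 1) ? 1 : 0;
printf "this host is %s-endian\n", $big_endian ? "big" : "little";
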
diff --git a/main/openssl/crypto/sha/asm/sha1-parisc.pl b/main/openssl/crypto/sha/asm/sha1-parisc.pl
new file mode 100644
index 00000000..6e5a328a
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-parisc.pl
@@ -0,0 +1,260 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for PA-RISC.
+
+# June 2009.
+#
+# On PA-7100LC performance is >30% better than gcc 3.2 generated code
+# for aligned input and >50% better for unaligned. Compared to the
+# vendor compiler on PA-8600 it's almost 60% faster in the 64-bit build
+# and just a few percent faster in the 32-bit one (this is for aligned
+# input; data for unaligned input is not available).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
+ # [+ argument transfer]
+$ctx="%r26"; # arg0
+$inp="%r25"; # arg1
+$num="%r24"; # arg2
+
+$t0="%r28";
+$t1="%r29";
+$K="%r31";
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
+
+@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<15);
+ addl $K,$e,$e ; $i
+ shd $a,$a,27,$t1
+ addl @X[$i],$e,$e
+ and $c,$b,$t0
+ addl $t1,$e,$e
+ andcm $d,$b,$t1
+ shd $b,$b,2,$b
+ or $t1,$t0,$t0
+ addl $t0,$e,$e
+___
+$code.=<<___ if ($i>=15); # with forward Xupdate
+ addl $K,$e,$e ; $i
+ shd $a,$a,27,$t1
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+ addl @X[$i%16],$e,$e
+ and $c,$b,$t0
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ addl $t1,$e,$e
+ andcm $d,$b,$t1
+ shd $b,$b,2,$b
+ or $t1,$t0,$t0
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ add $t0,$e,$e
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
+ addl $K,$e,$e
+ shd $a,$a,27,$t1
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ addl @X[$i%16],$e,$e
+ xor $b,$c,$t0
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor $d,$t0,$t0
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+ addl $t0,$e,$e
+___
+$code.=<<___ if ($i==79); # with context load
+ ldw 0($ctx),@X[0] ; $i
+ addl $K,$e,$e
+ shd $a,$a,27,$t1
+ ldw 4($ctx),@X[1]
+ addl @X[$i%16],$e,$e
+ xor $b,$c,$t0
+ ldw 8($ctx),@X[2]
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor $d,$t0,$t0
+ ldw 12($ctx),@X[3]
+ addl $t0,$e,$e
+ ldw 16($ctx),@X[4]
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+ shd $a,$a,27,$t1 ; $i
+ addl $K,$e,$e
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+ xor $d,$c,$t0
+ addl @X[$i%16],$e,$e
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ and $b,$t0,$t0
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ addl $t0,$e,$e
+ and $d,$c,$t1
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+ addl $t1,$e,$e
+___
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+sha1_block_data_order
+ .PROC
+ .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+
+ ldw 0($ctx),$A
+ ldw 4($ctx),$B
+ ldw 8($ctx),$C
+ ldw 12($ctx),$D
+ ldw 16($ctx),$E
+
+ extru $inp,31,2,$t0 ; t0=inp&3;
+ sh3addl $t0,%r0,$t0 ; t0*=8;
+ subi 32,$t0,$t0 ; t0=32-t0;
+ mtctl $t0,%cr11 ; %sar=t0;
+
+L\$oop
+ ldi 3,$t0
+ andcm $inp,$t0,$t0 ; 64-bit neutral
+___
+ for ($i=0;$i<15;$i++) { # load input block
+ $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
+$code.=<<___;
+ cmpb,*= $inp,$t0,L\$aligned
+ ldw 60($t0),@X[15]
+ ldw 64($t0),@X[16]
+___
+ for ($i=0;$i<16;$i++) { # align input
+ $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
+$code.=<<___;
+L\$aligned
+ ldil L'0x5a827000,$K ; K_00_19
+ ldo 0x999($K),$K
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0x6ed9e000,$K ; K_20_39
+ ldo 0xba1($K),$K
+___
+
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0x8f1bb000,$K ; K_40_59
+ ldo 0xcdc($K),$K
+___
+
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0xca62c000,$K ; K_60_79
+ ldo 0x1d6($K),$K
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ addl @X[0],$A,$A
+ addl @X[1],$B,$B
+ addl @X[2],$C,$C
+ addl @X[3],$D,$D
+ addl @X[4],$E,$E
+ stw $A,0($ctx)
+ stw $B,4($ctx)
+ stw $C,8($ctx)
+ stw $D,12($ctx)
+ stw $E,16($ctx)
+ addib,*<> -1,$num,L\$oop
+ ldo 64($inp),$inp
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+print $code;
+close STDOUT;
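
Like the other perlasm modules in this patch, sha1-parisc.pl writes frame offsets symbolically inside its here-docs and resolves them in a single pass at the end: `$code =~ s/\`([^\`]*)\`/eval $1/gem` replaces every backtick-quoted expression with its evaluated value. Backticks are ordinary characters inside a double-quotish here-doc, so they survive interpolation and can serve as eval markers. A toy reproduction of the idiom (register choice is ours; the sample values match the 64-bit flavour, 14*8+80):

use strict; use warnings;
my ($FRAME, $SIZE_T) = (192, 8);     # sample 64-bit values (14*8+80)
my $code = <<___;
	std	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	std	%r5,`-$FRAME+2*$SIZE_T`(%sp)
___
$code =~ s/\`([^\`]*)\`/eval $1/gem; # fold the arithmetic into constants
print $code;                         # offsets come out as -184 and -176
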
diff --git a/main/openssl/crypto/sha/asm/sha1-ppc.pl b/main/openssl/crypto/sha/asm/sha1-ppc.pl
index dcd0fcdf..2140dd2f 100755
--- a/main/openssl/crypto/sha/asm/sha1-ppc.pl
+++ b/main/openssl/crypto/sha/asm/sha1-ppc.pl
@@ -24,12 +24,14 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
+ $STU $sp,-$FRAME($sp)
mflr r0
- $STU $sp,`-($FRAME+64)`($sp)
- $PUSH r0,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
Laligned:
mtctr $num
bl Lsha1_block_private
-Ldone:
- $POP r0,`$FRAME-$SIZE_T*18`($sp)
- $POP r15,`$FRAME-$SIZE_T*17`($sp)
- $POP r16,`$FRAME-$SIZE_T*16`($sp)
- $POP r17,`$FRAME-$SIZE_T*15`($sp)
- $POP r18,`$FRAME-$SIZE_T*14`($sp)
- $POP r19,`$FRAME-$SIZE_T*13`($sp)
- $POP r20,`$FRAME-$SIZE_T*12`($sp)
- $POP r21,`$FRAME-$SIZE_T*11`($sp)
- $POP r22,`$FRAME-$SIZE_T*10`($sp)
- $POP r23,`$FRAME-$SIZE_T*9`($sp)
- $POP r24,`$FRAME-$SIZE_T*8`($sp)
- $POP r25,`$FRAME-$SIZE_T*7`($sp)
- $POP r26,`$FRAME-$SIZE_T*6`($sp)
- $POP r27,`$FRAME-$SIZE_T*5`($sp)
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- mtlr r0
- addi $sp,$sp,`$FRAME+64`
- blr
-___
+ b Ldone
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; The PowerPC specification allows an implementation to be ill-behaved
+; upon an unaligned access which crosses a page boundary. The "better
+; safe than sorry" principle makes me treat it specially. But I don't
+; look for the particular offending word, but rather for a 64-byte input
+; block which crosses the boundary. Once found, that block is aligned
+; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
Lcross_page:
li $t1,16
mtctr $t1
- addi r20,$sp,$FRAME ; spot below the frame
+ addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
addi r20,r20,4
bdnz Lmemcpy
- $PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
- addi $inp,$sp,$FRAME
+ addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
- $POP $inp,`$FRAME-$SIZE_T*19`($sp)
+ $POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
- b Ldone
+
+Ldone:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
___
# This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
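The Lcross_page path above implements what the relocated comment describes: a 64-byte block that would be read with unaligned word loads across a 4096-byte page boundary is first copied (the Lmemcpy loop) into scratch space inside the frame, now at $LOCALS rather than below the frame, and hashed from there. A rough Perl restatement of the trigger condition follows; the names are illustrative only, not code from the patch:

    use constant { PAGE => 4096, BLOCK => 64 };

    # True when a word-misaligned 64-byte input block straddles a page
    # boundary, i.e. when the Lunaligned/Lcross_page path must copy it.
    sub crosses_page {
        my ($addr) = @_;
        return (($addr & 3) != 0)                      # input not word-aligned
            && (($addr & (PAGE - 1)) > PAGE - BLOCK);  # block spans two pages
    }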
diff --git a/main/openssl/crypto/sha/asm/sha1-s390x.pl b/main/openssl/crypto/sha/asm/sha1-s390x.pl
index 4b178482..9193dda4 100644
--- a/main/openssl/crypto/sha/asm/sha1-s390x.pl
+++ b/main/openssl/crypto/sha/asm/sha1-s390x.pl
@@ -21,9 +21,28 @@
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific.
+
$kimdfunc=1; # magic function code for kimd instruction
-$output=shift;
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
@@ -42,13 +61,14 @@ $t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
- lg $prefetch,160($sp) ### Xupdate(16) warm-up
+ lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
___
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
- lg $prefetch,`160+4*(($i+2)%16)`($sp)
- xg $X[0],`160+4*(($i+8)%16)`($sp)
+ lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+ xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
lr $X[2],$X[1] # feedback
___
$code.=<<___ if ($i<=70);
- stg $X[0],`160+4*($i%16)`($sp)
+ stg $X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
- la %r1,16($sp)
+ la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
- lg %r0,16($sp)
+ lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
___
$code.=<<___;
lghi %r1,-$frame
- stg $ctx,16($sp)
- stmg %r6,%r15,48($sp)
+ st${g} $ctx,`2*$SIZE_T`($sp)
+ stm${g} %r6,%r15,`6*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
- stg %r0,0($sp)
+ st${g} %r0,0($sp)
larl $t0,Ktable
llgf $A,0($ctx)
@@ -199,7 +219,7 @@ ___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- lg $ctx,`$frame+16`($sp)
+ l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
@@ -211,13 +231,13 @@ $code.=<<___;
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
- brct $len,.Lloop
+ brct${g} $len,.Lloop
- lmg %r6,%r15,`$frame+48`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
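The -m31 adaptation above keys all stack arithmetic off $SIZE_T and appends the "g" suffix so that one source serves both 64-bit and 31-bit builds; per the November 2010 note, 64-bit instructions are usable from 31-bit code only when the kernel advertises "highgprs" in /proc/cpuinfo. A hedged sketch of one way a harness could test for that flag; this is illustrative, not OpenSSL's own detection code:

    # Returns 1 when the features line of /proc/cpuinfo lists "highgprs".
    sub have_highgprs {
        open my $fh, '<', '/proc/cpuinfo' or return 0;
        my $found = 0;
        while (my $line = <$fh>) {
            if ($line =~ /^features\b.*\bhighgprs\b/) { $found = 1; last; }
        }
        close $fh;
        return $found;
    }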
diff --git a/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl b/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
index 85e8d680..e65291bb 100644
--- a/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
+++ b/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
@@ -549,7 +549,7 @@ ___
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
-my $ref,$opf;
+my ($ref,$opf);
my %visopf = ( "fmul8ulx16" => 0x037,
"faligndata" => 0x048,
"fpadd32" => 0x052,
diff --git a/main/openssl/crypto/sha/asm/sha1-x86_64.S b/main/openssl/crypto/sha/asm/sha1-x86_64.S
new file mode 100644
index 00000000..3922e203
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-x86_64.S
@@ -0,0 +1,2486 @@
+.text
+
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+ movl OPENSSL_ia32cap_P+0(%rip),%r9d
+ movl OPENSSL_ia32cap_P+4(%rip),%r8d
+ testl $512,%r8d
+ jz .Lialu
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ movq %rsp,%r11
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %r11,64(%rsp)
+.Lprologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl %edx,0(%rsp)
+ movl %r11d,%eax
+ movl 4(%r9),%ebp
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ andl %edi,%eax
+ movl %ebp,4(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 8(%r9),%edx
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r12,1),%r12d
+ andl %esi,%eax
+ movl %edx,8(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 12(%r9),%ebp
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %ebp,12(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 16(%r9),%edx
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %edx,16(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 20(%r9),%ebp
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %ebp,20(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl %r11d,%eax
+ movl 24(%r9),%edx
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r13,1),%r13d
+ andl %edi,%eax
+ movl %edx,24(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 28(%r9),%ebp
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r12,1),%r12d
+ andl %esi,%eax
+ movl %ebp,28(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 32(%r9),%edx
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %edx,32(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 36(%r9),%ebp
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %ebp,36(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 40(%r9),%edx
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %edx,40(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl %r11d,%eax
+ movl 44(%r9),%ebp
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ andl %edi,%eax
+ movl %ebp,44(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 48(%r9),%edx
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r12,1),%r12d
+ andl %esi,%eax
+ movl %edx,48(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 52(%r9),%ebp
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %ebp,52(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 56(%r9),%edx
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %edx,56(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 60(%r9),%ebp
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %ebp,60(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 0(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%edx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ xorl 52(%rsp),%edx
+ xorl %r12d,%eax
+ roll $1,%edx
+ addl %ecx,%r13d
+ roll $30,%edi
+ movl %edx,0(%rsp)
+ addl %eax,%r13d
+ movl 4(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%ebp
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ xorl 56(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $1,%ebp
+ addl %ecx,%r12d
+ roll $30,%esi
+ movl %ebp,4(%rsp)
+ addl %eax,%r12d
+ movl 8(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ xorl 60(%rsp),%edx
+ xorl %edi,%eax
+ roll $1,%edx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ movl %edx,8(%rsp)
+ addl %eax,%r11d
+ movl 12(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ xorl 0(%rsp),%ebp
+ xorl %esi,%eax
+ roll $1,%ebp
+ addl %ecx,%edi
+ roll $30,%r12d
+ movl %ebp,12(%rsp)
+ addl %eax,%edi
+ movl 16(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ xorl 4(%rsp),%edx
+ xorl %r13d,%eax
+ roll $1,%edx
+ addl %ecx,%esi
+ roll $30,%r11d
+ movl %edx,16(%rsp)
+ addl %eax,%esi
+ movl 20(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl 52(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 8(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl 56(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 12(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl 60(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 16(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl 0(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 20(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl 4(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 24(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,36(%rsp)
+ movl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 48(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 28(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,40(%rsp)
+ movl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 32(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,44(%rsp)
+ movl 48(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 56(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 36(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,48(%rsp)
+ movl 52(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 40(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,52(%rsp)
+ movl 56(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 44(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,56(%rsp)
+ movl 60(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl 28(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 48(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,60(%rsp)
+ movl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl 32(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 52(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,0(%rsp)
+ movl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl 36(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 56(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,4(%rsp)
+ movl 8(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 60(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,8(%rsp)
+ movl 12(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 0(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,12(%rsp)
+ movl 16(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl 48(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 4(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,16(%rsp)
+ movl 20(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl 52(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 8(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl 56(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 12(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl 60(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 16(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 20(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%ebx
+ leal -1894007588(%rdx,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 24(%rsp),%ebp
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %ebp,36(%rsp)
+ addl %ecx,%r13d
+ movl 40(%rsp),%edx
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 48(%rsp),%edx
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r11d,%ebx
+ leal -1894007588(%rbp,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%edx
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %edx,40(%rsp)
+ addl %ecx,%r12d
+ movl 44(%rsp),%ebp
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %edi,%ebx
+ leal -1894007588(%rdx,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%ebp
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %ebp,44(%rsp)
+ addl %ecx,%r11d
+ movl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%ebx
+ leal -1894007588(%rbp,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%edx
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %edx,48(%rsp)
+ addl %ecx,%edi
+ movl 52(%rsp),%ebp
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 60(%rsp),%ebp
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%ebx
+ leal -1894007588(%rdx,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%ebp
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %ebp,52(%rsp)
+ addl %ecx,%esi
+ movl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 0(%rsp),%edx
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%ebx
+ leal -1894007588(%rbp,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 44(%rsp),%edx
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%edx
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %edx,56(%rsp)
+ addl %ecx,%r13d
+ movl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 4(%rsp),%ebp
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%ebx
+ leal -1894007588(%rdx,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 48(%rsp),%ebp
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %ebp,60(%rsp)
+ addl %ecx,%r12d
+ movl 0(%rsp),%edx
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 8(%rsp),%edx
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %edi,%ebx
+ leal -1894007588(%rbp,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%edx
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %edx,0(%rsp)
+ addl %ecx,%r11d
+ movl 4(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 12(%rsp),%ebp
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %esi,%ebx
+ leal -1894007588(%rdx,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%ebp
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %ebp,4(%rsp)
+ addl %ecx,%edi
+ movl 8(%rsp),%edx
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 16(%rsp),%edx
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r13d,%ebx
+ leal -1894007588(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%edx
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %edx,8(%rsp)
+ addl %ecx,%esi
+ movl 12(%rsp),%ebp
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 20(%rsp),%ebp
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r12d,%ebx
+ leal -1894007588(%rdx,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %ebp,12(%rsp)
+ addl %ecx,%r13d
+ movl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 24(%rsp),%edx
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%edx
+ xorl %r11d,%ebx
+ leal -1894007588(%rbp,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 4(%rsp),%edx
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%edx
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %edx,16(%rsp)
+ addl %ecx,%r12d
+ movl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 28(%rsp),%ebp
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %edi,%ebx
+ leal -1894007588(%rdx,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 8(%rsp),%ebp
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%ebp
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %ebp,20(%rsp)
+ addl %ecx,%r11d
+ movl 24(%rsp),%edx
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 32(%rsp),%edx
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ xorl %esi,%ebx
+ leal -1894007588(%rbp,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%edx
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %edx,24(%rsp)
+ addl %ecx,%edi
+ movl 28(%rsp),%ebp
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r13d,%ebx
+ leal -1894007588(%rdx,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%ebp
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %ebp,28(%rsp)
+ addl %ecx,%esi
+ movl 32(%rsp),%edx
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 40(%rsp),%edx
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r12d,%ebx
+ leal -1894007588(%rbp,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 20(%rsp),%edx
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%edx
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %edx,32(%rsp)
+ addl %ecx,%r13d
+ movl 36(%rsp),%ebp
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 44(%rsp),%ebp
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r11d,%ebx
+ leal -1894007588(%rdx,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 24(%rsp),%ebp
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %ebp,36(%rsp)
+ addl %ecx,%r12d
+ movl 40(%rsp),%edx
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %edi,%ebx
+ leal -1894007588(%rbp,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%edx
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %edx,40(%rsp)
+ addl %ecx,%r11d
+ movl 44(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %esi,%ebx
+ leal -1894007588(%rdx,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%ebp
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %ebp,44(%rsp)
+ addl %ecx,%edi
+ movl 48(%rsp),%edx
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r13d,%ebx
+ leal -1894007588(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%edx
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %edx,48(%rsp)
+ addl %ecx,%esi
+ movl 52(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl 20(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 40(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,52(%rsp)
+ movl 56(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl 24(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 44(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,56(%rsp)
+ movl 60(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl 28(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 48(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,60(%rsp)
+ movl 0(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl 32(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 52(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,0(%rsp)
+ movl 4(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl 36(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 56(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,4(%rsp)
+ movl 8(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl 40(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 60(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,8(%rsp)
+ movl 12(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl 44(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 0(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,12(%rsp)
+ movl 16(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl 48(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 4(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,16(%rsp)
+ movl 20(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl 52(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 8(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 32(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl 56(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 12(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 16(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl 0(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 20(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl 4(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 24(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,36(%rsp)
+ movl 40(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 48(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl 8(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 28(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,40(%rsp)
+ movl 44(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl 12(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 32(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,44(%rsp)
+ movl 48(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 56(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 36(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,48(%rsp)
+ movl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 40(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl 56(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl 24(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 44(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl 60(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl 28(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 48(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl %r11d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz .Lloop
+
+ movq 64(%rsp),%rsi
+ movq (%rsi),%r13
+ movq 8(%rsi),%r12
+ movq 16(%rsi),%rbp
+ movq 24(%rsi),%rbx
+ leaq 32(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ leaq -64(%rsp),%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX(%rip),%r11
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r10,%r9
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq 64(%rsp),%rsi
+ movq 0(%rsi),%r12
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%rbx
+ leaq 24(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
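The prologue of this new sha1-x86_64.S dispatches at run time on OPENSSL_ia32cap_P: "testl $512,%r8d" probes bit 9 of the second capability word, the CPUID SSSE3 flag, before falling back to the integer-only .Lialu path, and the generating script (next file) additionally gates an AVX path on bit 28 of that word plus the "Intel CPU" bit 30 of the first word. Spelling the masks out, with values taken from the test/and/cmp constants in this patch and descriptive names that are mine, not OpenSSL's:

    my $SSSE3 = 1 << 9;             # 512, the "testl" immediate above
    my $AVX   = 1 << 28;            # bit masked out of capability word 1
    my $INTEL = 1 << 30;            # bit masked out of capability word 0
    printf "SSSE3 mask %d, AVX gate 0x%x\n", $SSSE3, $AVX | $INTEL;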
diff --git a/main/openssl/crypto/sha/asm/sha1-x86_64.pl b/main/openssl/crypto/sha/asm/sha1-x86_64.pl
index 4edc5ea9..f15c7ec3 100755
--- a/main/openssl/crypto/sha/asm/sha1-x86_64.pl
+++ b/main/openssl/crypto/sha/asm/sha1-x86_64.pl
@@ -16,7 +16,7 @@
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
-# implementation:-) However! While 64-bit code does performs better
+# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,38 @@
# Xeon P4 +65% +0% 9.9
# Core2 +60% +10% 7.0
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+# x86_64 SSSE3 AVX
+# P4 9.8 -
+# Opteron 6.6 -
+# Core2 6.7 6.1/+10% -
+# Atom 11.0 9.7/+13% -
+# Westmere 7.1 5.6/+27% -
+# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -40,7 +72,18 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$ctx="%rdi"; # 1st arg
$inp="%rsi"; # 2nd arg
@@ -51,196 +94,994 @@ $ctx="%r8";
$inp="%r9";
$num="%r10";
-$xi="%eax";
-$t0="%ebx";
-$t1="%ecx";
-$A="%edx";
-$B="%esi";
-$C="%edi";
-$D="%ebp";
-$E="%r11d";
-$T="%r12d";
-
-@V=($A,$B,$C,$D,$E,$T);
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
-sub PROLOGUE {
-my $func=shift;
-$code.=<<___;
-.globl $func
-.type $func,\@function,3
-.align 16
-$func:
- push %rbx
- push %rbp
- push %r12
- mov %rsp,%r11
- mov %rdi,$ctx # reassigned argument
- sub \$`8+16*4`,%rsp
- mov %rsi,$inp # reassigned argument
- and \$-64,%rsp
- mov %rdx,$num # reassigned argument
- mov %r11,`16*4`(%rsp)
-.Lprologue:
-
- mov 0($ctx),$A
- mov 4($ctx),$B
- mov 8($ctx),$C
- mov 12($ctx),$D
- mov 16($ctx),$E
-___
-}
-
-sub EPILOGUE {
-my $func=shift;
-$code.=<<___;
- mov `16*4`(%rsp),%rsi
- mov (%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
-.Lepilogue:
- ret
-.size $func,.-$func
-___
-}
+@V=($A,$B,$C,$D,$E);
sub BODY_00_19 {
-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
- mov `4*$i`($inp),$xi
- `"bswap $xi" if(!defined($host))`
- mov $xi,`4*$i`(%rsp)
+ mov `4*$i`($inp),$xi[0]
+ bswap $xi[0]
+ mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
- lea 0x5a827999($xi,$e),$f
mov $c,$t0
- mov `4*$j`($inp),$xi
- mov $a,$e
+ mov `4*$j`($inp),$xi[1]
+ mov $a,$t2
xor $d,$t0
- `"bswap $xi" if(!defined($host))`
- rol \$5,$e
+ bswap $xi[1]
+ rol \$5,$t2
+ lea 0x5a827999($xi[0],$e),$e
and $b,$t0
- mov $xi,`4*$j`(%rsp)
- add $e,$f
+ mov $xi[1],`4*$j`(%rsp)
+ add $t2,$e
xor $d,$t0
rol \$30,$b
- add $t0,$f
+ add $t0,$e
___
$code.=<<___ if ($i>=15);
- lea 0x5a827999($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
+ mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
- mov $a,$e
- xor `4*(($j+2)%16)`(%rsp),$xi
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $d,$t0
- rol \$5,$e
- xor `4*(($j+8)%16)`(%rsp),$xi
+ rol \$5,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
- add $e,$f
- xor `4*(($j+13)%16)`(%rsp),$xi
+ lea 0x5a827999($xi[0],$e),$e
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
xor $d,$t0
+ rol \$1,$xi[1]
+ add $t2,$e
rol \$30,$b
- add $t0,$f
- rol \$1,$xi
- mov $xi,`4*($j%16)`(%rsp)
+ mov $xi[1],`4*($j%16)`(%rsp)
+ add $t0,$e
___
+unshift(@xi,pop(@xi));
}
sub BODY_20_39 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- lea $K($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
+ mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
- mov $a,$e
- xor `4*(($j+2)%16)`(%rsp),$xi
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $b,$t0
- rol \$5,$e
- xor `4*(($j+8)%16)`(%rsp),$xi
+ rol \$5,$t2
+ lea $K($xi[0],$e),$e
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
xor $d,$t0
- add $e,$f
- xor `4*(($j+13)%16)`(%rsp),$xi
+ add $t2,$e
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
- add $t0,$f
- rol \$1,$xi
+ add $t0,$e
+ rol \$1,$xi[1]
___
$code.=<<___ if ($i<76);
- mov $xi,`4*($j%16)`(%rsp)
+ mov $xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
- lea $K($xi,$e),$f
mov $c,$t0
- mov $a,$e
+ mov $a,$t2
xor $b,$t0
- rol \$5,$e
+ lea $K($xi[0],$e),$e
+ rol \$5,$t2
xor $d,$t0
- add $e,$f
+ add $t2,$e
rol \$30,$b
- add $t0,$f
+ add $t0,$e
___
+unshift(@xi,pop(@xi));
}
sub BODY_40_59 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- lea 0x8f1bbcdc($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
- mov $b,$t0
- mov $b,$t1
- xor `4*(($j+2)%16)`(%rsp),$xi
- mov $a,$e
- and $c,$t0
- xor `4*(($j+8)%16)`(%rsp),$xi
- or $c,$t1
- rol \$5,$e
- xor `4*(($j+13)%16)`(%rsp),$xi
- and $d,$t1
- add $e,$f
- rol \$1,$xi
- or $t1,$t0
+ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
+ mov $c,$t1
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ and $d,$t0
+ mov $a,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ xor $d,$t1
+ lea 0x8f1bbcdc($xi[0],$e),$e
+ rol \$5,$t2
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ add $t0,$e
+ and $b,$t1
+ rol \$1,$xi[1]
+ add $t1,$e
rol \$30,$b
- mov $xi,`4*($j%16)`(%rsp)
- add $t0,$f
+ mov $xi[1],`4*($j%16)`(%rsp)
+ add $t2,$e
___
+unshift(@xi,pop(@xi));
}
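
A minimal C sketch (reference only, not part of the patch) of the scalar round the three BODY_* generators above emit. The branchless identities are the ones the asm relies on: Ch(b,c,d) = ((c^d)&b)^d for rounds 0-19, plain parity for 20-39 and 60-79, and Maj(b,c,d) decomposed as (c&d) + ((c^d)&b) for 40-59; the two Maj terms can never share a set bit, which is why the asm may add them into $e separately.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round; the caller rotates (a,b,c,d,e) -> (e,a,b,c,d)
 * afterwards, as unshift(@V,pop(@V)) does in the Perl driver. */
static void sha1_round(int i, uint32_t a, uint32_t *b, uint32_t c,
                       uint32_t d, uint32_t *e, uint32_t w)
{
	static const uint32_t K[4] =
	    { 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 };
	uint32_t f;

	if (i < 20)			/* Ch(b,c,d), branchless */
		f = ((c ^ d) & *b) ^ d;
	else if (i < 40 || i >= 60)	/* Parity(b,c,d) */
		f = *b ^ c ^ d;
	else				/* Maj(b,c,d), two disjoint terms */
		f = (c & d) + ((c ^ d) & *b);

	*e += rotl32(a, 5) + f + w + K[i / 20];
	*b  = rotl32(*b, 30);
}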
-$code=".text\n";
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
-&PROLOGUE("sha1_block_data_order");
-$code.=".align 4\n.Lloop:\n";
+.globl sha1_block_data_order
+.type sha1_block_data_order,\@function,3
+.align 16
+sha1_block_data_order:
+ mov OPENSSL_ia32cap_P+0(%rip),%r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r8d
+ test \$`1<<9`,%r8d # check SSSE3 bit
+ jz .Lialu
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r8d # mask AVX bit
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
+ or %r9d,%r8d
+ cmp \$`1<<28|1<<30`,%r8d
+ je _avx_shortcut
+___
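
In C terms the dispatch above is roughly the sketch below. It assumes the usual OPENSSL_ia32cap_P layout (word 0 holds CPUID.1:EDX plus OpenSSL's synthetic bits, bit 30 marking an Intel CPU; word 1 holds CPUID.1:ECX, bit 9 = SSSE3, bit 28 = AVX), and the three sha1_*() names are only stand-ins for the .Lialu, _ssse3_shortcut and _avx_shortcut paths.

#include <stddef.h>

extern unsigned int OPENSSL_ia32cap_P[2];	/* filled by OPENSSL_cpuid_setup() */

/* Illustrative stand-ins for the three code paths in the asm above. */
void sha1_ialu(void *ctx, const void *inp, size_t num);
void sha1_ssse3(void *ctx, const void *inp, size_t num);
void sha1_avx(void *ctx, const void *inp, size_t num);

static void sha1_dispatch(void *ctx, const void *inp, size_t num)
{
	unsigned int ecx = OPENSSL_ia32cap_P[1];
	unsigned int edx = OPENSSL_ia32cap_P[0];

	if (!(ecx & (1u << 9)))			/* no SSSE3: plain IALU */
		sha1_ialu(ctx, inp, num);
	else if ((ecx & (1u << 28)) && (edx & (1u << 30)))
		sha1_avx(ctx, inp, num);	/* AVX and "Intel CPU" bit */
	else
		sha1_ssse3(ctx, inp, num);
}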
+$code.=<<___;
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ mov %rsp,%r11
+ mov %rdi,$ctx # reassigned argument
+ sub \$`8+16*4`,%rsp
+ mov %rsi,$inp # reassigned argument
+ and \$-64,%rsp
+ mov %rdx,$num # reassigned argument
+ mov %r11,`16*4`(%rsp)
+.Lprologue:
+
+ mov 0($ctx),$A
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ jmp .Lloop
+
+.align 16
+.Lloop:
+___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- add 0($ctx),$E
- add 4($ctx),$T
- add 8($ctx),$A
- add 12($ctx),$B
- add 16($ctx),$C
- mov $E,0($ctx)
- mov $T,4($ctx)
- mov $A,8($ctx)
- mov $B,12($ctx)
- mov $C,16($ctx)
-
- xchg $E,$A # mov $E,$A
- xchg $T,$B # mov $T,$B
- xchg $E,$C # mov $A,$C
- xchg $T,$D # mov $B,$D
- # mov $C,$E
- lea `16*4`($inp),$inp
+ add 0($ctx),$A
+ add 4($ctx),$B
+ add 8($ctx),$C
+ add 12($ctx),$D
+ add 16($ctx),$E
+ mov $A,0($ctx)
+ mov $B,4($ctx)
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+
sub \$1,$num
+ lea `16*4`($inp),$inp
jnz .Lloop
+
+ mov `16*4`(%rsp),%rsi
+ mov (%rsi),%r13
+ mov 8(%rsi),%r12
+ mov 16(%rsi),%rbp
+ mov 24(%rsi),%rbx
+ lea 32(%rsi),%rsp
+.Lepilogue:
+ ret
+.size sha1_block_data_order,.-sha1_block_data_order
___
-&EPILOGUE("sha1_block_data_order");
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type sha1_block_data_order_ssse3,\@function,3
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ jmp .Loop_ssse3
+___
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub Xupdate_ssse3_16_31()	# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
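
Scalar form of what this routine vectorizes, four schedule elements per call: the standard SHA-1 recurrence W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) <<< 1 over a 16-word ring, mirroring the %rsp scratch area. The paddd/psrld 31/por trio above builds the vector rotate-by-one, and the pslldq/psrldq shuffling patches up the fact that W[i-3] falls inside the quad currently being computed.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One scalar step of the schedule Xupdate_ssse3_16_31 produces
 * four lanes at a time (all indices taken mod 16). */
static uint32_t sha1_schedule(uint32_t w[16], int i)
{
	uint32_t x = w[(i - 3) & 15] ^ w[(i - 8) & 15] ^
	             w[(i - 14) & 15] ^ w[i & 15];	/* i-16 == i mod 16 */
	return w[i & 15] = rotl32(x, 1);
}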
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
$code.=<<___;
-.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
+___
+$code.=<<___;
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type sha1_block_data_order_avx,\@function,3
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+ vzeroupper
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ jmp .Loop_avx
+___
+
+sub Xupdate_avx_16_31()	# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vzeroupper
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
+___
+$code.=<<___;
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+___
+}}}
+$code.=<<___;
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -272,25 +1113,75 @@ se_handler:
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
- lea 24(%rax),%rax
+ lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
-.Lin_prologue:
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 64(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$10,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `24+64+5*16`(%rax),%rax # adjust stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
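
A hedged C rendering of what the new ssse3_handler implements (a sketch only: field offsets such as 152 = Rsp, 248 = Rip and 512 = Xmm6 are the CONTEXT members named in the comments, HandlerData[0..1] are the prologue/epilogue RVAs installed in .xdata below, and the real common tail additionally patches rdi/rsi and runs the language-specific dispatch, omitted here):

#include <windows.h>
#include <string.h>

static EXCEPTION_DISPOSITION
ssse3_handler_sketch(EXCEPTION_RECORD *rec, ULONG64 frame,
                     CONTEXT *context, DISPATCHER_CONTEXT *disp)
{
	DWORD  *info     = (DWORD *)disp->HandlerData;
	ULONG64 prologue = disp->ImageBase + info[0];
	ULONG64 epilogue = disp->ImageBase + info[1];

	(void)rec; (void)frame;

	if (context->Rip >= prologue && context->Rip < epilogue) {
		ULONG64 rax = context->Rsp;	/* frame at fault time */
		/* xmm6-xmm10 were spilled at 64(%rsp) by the prologue */
		memcpy(&context->Xmm6, (const void *)(rax + 64), 5 * 16);
		rax += 24 + 64 + 5 * 16;	/* past spill + push area */
		context->Rbx = ((ULONG64 *)rax)[-1];
		context->Rbp = ((ULONG64 *)rax)[-2];
		context->R12 = ((ULONG64 *)rax)[-3];
		context->Rsp = rax;		/* unwound to the caller */
	}
	return ExceptionContinueSearch;
}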
@@ -328,19 +1219,38 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size ssse3_handler,.-ssse3_handler
.section .pdata
.align 4
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
-
+ .rva .LSEH_begin_sha1_block_data_order_ssse3
+ .rva .LSEH_end_sha1_block_data_order_ssse3
+ .rva .LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_sha1_block_data_order_avx
+ .rva .LSEH_end_sha1_block_data_order_avx
+ .rva .LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
+.LSEH_info_sha1_block_data_order_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
}
diff --git a/main/openssl/crypto/sha/asm/sha256-586.S b/main/openssl/crypto/sha/asm/sha256-586.S
new file mode 100644
index 00000000..77a89514
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-586.S
@@ -0,0 +1,258 @@
+.file "sha512-586.s"
+.text
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.L_sha256_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K256-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+.align 16
+.L002loop:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ subl $32,%esp
+ movl %edi,100(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edi,28(%esp)
+.align 16
+.L00300_15:
+ movl 92(%esp),%ebx
+ movl %edx,%ecx
+ rorl $14,%ecx
+ movl 20(%esp),%esi
+ xorl %edx,%ecx
+ rorl $5,%ecx
+ xorl %edx,%ecx
+ rorl $6,%ecx
+ movl 24(%esp),%edi
+ addl %ecx,%ebx
+ xorl %edi,%esi
+ movl %edx,16(%esp)
+ movl %eax,%ecx
+ andl %edx,%esi
+ movl 12(%esp),%edx
+ xorl %edi,%esi
+ movl %eax,%edi
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl 28(%esp),%ebx
+ xorl %eax,%ecx
+ rorl $11,%ecx
+ movl 4(%esp),%esi
+ xorl %eax,%ecx
+ rorl $2,%ecx
+ addl %ebx,%edx
+ movl 8(%esp),%edi
+ addl %ecx,%ebx
+ movl %eax,(%esp)
+ movl %eax,%ecx
+ subl $4,%esp
+ orl %esi,%eax
+ andl %esi,%ecx
+ andl %edi,%eax
+ movl (%ebp),%esi
+ orl %ecx,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ addl %esi,%edx
+ addl %esi,%eax
+ cmpl $3248222580,%esi
+ jne .L00300_15
+ movl 152(%esp),%ebx
+.align 16
+.L00416_63:
+ movl %ebx,%esi
+ movl 100(%esp),%ecx
+ rorl $11,%esi
+ movl %ecx,%edi
+ xorl %ebx,%esi
+ rorl $7,%esi
+ shrl $3,%ebx
+ rorl $2,%edi
+ xorl %esi,%ebx
+ xorl %ecx,%edi
+ rorl $17,%edi
+ shrl $10,%ecx
+ addl 156(%esp),%ebx
+ xorl %ecx,%edi
+ addl 120(%esp),%ebx
+ movl %edx,%ecx
+ addl %edi,%ebx
+ rorl $14,%ecx
+ movl 20(%esp),%esi
+ xorl %edx,%ecx
+ rorl $5,%ecx
+ movl %ebx,92(%esp)
+ xorl %edx,%ecx
+ rorl $6,%ecx
+ movl 24(%esp),%edi
+ addl %ecx,%ebx
+ xorl %edi,%esi
+ movl %edx,16(%esp)
+ movl %eax,%ecx
+ andl %edx,%esi
+ movl 12(%esp),%edx
+ xorl %edi,%esi
+ movl %eax,%edi
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl 28(%esp),%ebx
+ xorl %eax,%ecx
+ rorl $11,%ecx
+ movl 4(%esp),%esi
+ xorl %eax,%ecx
+ rorl $2,%ecx
+ addl %ebx,%edx
+ movl 8(%esp),%edi
+ addl %ecx,%ebx
+ movl %eax,(%esp)
+ movl %eax,%ecx
+ subl $4,%esp
+ orl %esi,%eax
+ andl %esi,%ecx
+ andl %edi,%eax
+ movl (%ebp),%esi
+ orl %ecx,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ movl 152(%esp),%ebx
+ addl %esi,%edx
+ addl %esi,%eax
+ cmpl $3329325298,%esi
+ jne .L00416_63
+ movl 352(%esp),%esi
+ movl 4(%esp),%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edi
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%ecx
+ addl 12(%esi),%edi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edi,12(%esi)
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl 356(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ addl $352,%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K256:
+.long 1116352408,1899447441,3049323471,3921009573
+.long 961987163,1508970993,2453635748,2870763221
+.long 3624381080,310598401,607225278,1426881987
+.long 1925078388,2162078206,2614888103,3248222580
+.long 3835390401,4022224774,264347078,604807628
+.long 770255983,1249150122,1555081692,1996064986
+.long 2554220882,2821834349,2952996808,3210313671
+.long 3336571891,3584528711,113926993,338241895
+.long 666307205,773529912,1294757372,1396182291
+.long 1695183700,1986661051,2177026350,2456956037
+.long 2730485921,2820302411,3259730800,3345764771
+.long 3516065817,3600352804,4094571909,275423344
+.long 430227734,506948616,659060556,883997877
+.long 958139571,1322822218,1537002063,1747873779
+.long 1955562222,2024104815,2227730452,2361852424
+.long 2428436474,2756734187,3204031479,3329325298
+.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
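
One quirk of the generated loops above: they carry no round counter. Each iteration pulls the next K256 word into %esi, and the cmpl $3248222580,%esi and cmpl $3329325298,%esi tests simply compare it against K256[15] = 0xc19bf174 and K256[63] = 0xc67178f2, the last constants of each phase. A sketch of the idea:

#include <stdint.h>

/* Counter-free loop in the style of .L00300_15 above: iterate until
 * the round constant just consumed is the phase's final one. */
static const uint32_t *rounds_00_15_sketch(const uint32_t *kp)
{
	uint32_t k;
	do {
		k = *kp++;
		/* ... one SHA-256 round using k ... */
	} while (k != 0xc19bf174);	/* K256[15] == 3248222580 */
	return kp;
}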
diff --git a/main/openssl/crypto/sha/asm/sha256-586.pl b/main/openssl/crypto/sha/asm/sha256-586.pl
index ecc8b69c..52a7c7f8 100644
--- a/main/openssl/crypto/sha/asm/sha256-586.pl
+++ b/main/openssl/crypto/sha/asm/sha256-586.pl
@@ -14,14 +14,14 @@
# Pentium PIII P4 AMD K8 Core2
# gcc 46 36 41 27 26
# icc 57 33 38 25 23
-# x86 asm 40 30 35 20 20
-# x86_64 asm(*) - - 21 15.8 16.5
+# x86 asm 40 30 33 20 18
+# x86_64 asm(*) - - 21 16 16
#
# (*) x86_64 assembler performance is presented for reference
# purposes.
#
# Performance improvement over compiler generated code varies from
-# 10% to 40% [see above]. Not very impressive on some µ-archs, but
+# 10% to 40% [see above]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes amount of writes.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -48,20 +48,19 @@ sub BODY_00_15() {
my $in_16_63=shift;
&mov ("ecx",$E);
- &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7]
- &ror ("ecx",6);
- &mov ("edi",$E);
- &ror ("edi",11);
+ &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
+ &ror ("ecx",25-11);
&mov ("esi",$Foff);
- &xor ("ecx","edi");
- &ror ("edi",25-11);
+ &xor ("ecx",$E);
+ &ror ("ecx",11-6);
&mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
- &xor ("ecx","edi"); # Sigma1(e)
+ &xor ("ecx",$E);
+ &ror ("ecx",6); # Sigma1(e)
&mov ("edi",$Goff);
&add ($T,"ecx"); # T += Sigma1(e)
- &mov ($Eoff,$E); # modulo-scheduled
&xor ("esi","edi");
+ &mov ($Eoff,$E); # modulo-scheduled
&mov ("ecx",$A);
&and ("esi",$E);
&mov ($E,$Doff); # e becomes d, which is e in next iteration
@@ -69,14 +68,14 @@ sub BODY_00_15() {
&mov ("edi",$A);
&add ($T,"esi"); # T += Ch(e,f,g)
- &ror ("ecx",2);
+ &ror ("ecx",22-13);
&add ($T,$Hoff); # T += h
- &ror ("edi",13);
+ &xor ("ecx",$A);
+ &ror ("ecx",13-2);
&mov ("esi",$Boff);
- &xor ("ecx","edi");
- &ror ("edi",22-13);
+ &xor ("ecx",$A);
+ &ror ("ecx",2); # Sigma0(a)
&add ($E,$T); # d += T
- &xor ("ecx","edi"); # Sigma0(a)
&mov ("edi",$Coff);
&add ($T,"ecx"); # T += Sigma0(a)
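
The rewrite above is more than instruction scheduling: it evaluates Sigma1(e) as three dependent rotate/xor steps on one register, using the identity Sigma1(e) = ((((e >>> 14) ^ e) >>> 5) ^ e) >>> 6 (the ror amounts 25-11, 11-6 and 6 in the code), and Sigma0(a) likewise with 22-13, 13-2 and 2. A self-checking C sketch; since both forms are GF(2)-linear in e, testing the 32 single-bit inputs proves the identity for all inputs:

#include <assert.h>
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static uint32_t Sigma1(uint32_t e)		/* textbook form */
{
	return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
}

static uint32_t Sigma1_folded(uint32_t e)	/* form computed above */
{
	uint32_t t = rotr32(e, 25 - 11);	/* ror 14 */
	t = rotr32(t ^ e, 11 - 6);		/* ror 5  */
	return rotr32(t ^ e, 6);		/* ror 6  */
}

int main(void)
{
	for (uint32_t e = 1; e; e <<= 1)	/* all single-bit inputs */
		assert(Sigma1(e) == Sigma1_folded(e));
	return 0;
}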
@@ -168,23 +167,22 @@ sub BODY_00_15() {
&set_label("16_63",16);
&mov ("esi",$T);
&mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
- &shr ($T,3);
- &ror ("esi",7);
- &xor ($T,"esi");
&ror ("esi",18-7);
&mov ("edi","ecx");
- &xor ($T,"esi"); # T = sigma0(X[-15])
+ &xor ("esi",$T);
+ &ror ("esi",7);
+ &shr ($T,3);
- &shr ("ecx",10);
- &mov ("esi",&DWP(4*(8+15+16),"esp"));
- &ror ("edi",17);
- &xor ("ecx","edi");
&ror ("edi",19-17);
- &add ($T,"esi"); # T += X[-16]
- &xor ("edi","ecx") # sigma1(X[-2])
+ &xor ($T,"esi"); # T = sigma0(X[-15])
+ &xor ("edi","ecx");
+ &ror ("edi",17);
+ &shr ("ecx",10);
+ &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
+ &xor ("edi","ecx"); # sigma1(X[-2])
- &add ($T,"edi"); # T += sigma1(X[-2])
- # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
+ &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
+ # &add ($T,"edi"); # T += sigma1(X[-2])
# &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
&BODY_00_15(1);
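
For reference, the quantities being reordered here, in C: sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3), sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10), and the schedule step X[0] = sigma1(X[-2]) + X[-7] + sigma0(X[-15]) + X[-16]. The diff only moves where the X[-7] and sigma1(X[-2]) additions happen, not what is computed. A sketch:

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t sigma0(uint32_t x)	/* sigma0(X[-15]) */
{
	return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* sigma1(X[-2]) */
{
	return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}

/* One schedule step over the 16-word ring kept on the stack. */
static uint32_t sha256_schedule(uint32_t x[16], int i)
{
	x[i & 15] += sigma1(x[(i - 2) & 15]) + x[(i - 7) & 15]
	           + sigma0(x[(i - 15) & 15]);
	return x[i & 15];
}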
diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.pl b/main/openssl/crypto/sha/asm/sha256-armv4.pl
index 492cb62b..9c84e8d9 100644
--- a/main/openssl/crypto/sha/asm/sha256-armv4.pl
+++ b/main/openssl/crypto/sha/asm/sha256-armv4.pl
@@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~17 cycles per processed byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
-$inp="r1";
+$inp="r1"; $t3="r1";
$len="r2"; $t1="r2";
$T1="r3";
$A="r4";
@@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ ldr $T1,[$inp],#4
+#else
ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1]
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24
- `"str $inp,[sp,#17*4]" if ($i==15)`
+#endif
___
$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0]
- str $T1,[sp,#`$i%16`*4]
+ ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g
+#if $i>=16
+ add $T1,$T1,$t3 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $T1,$T1
+#endif
+#if $i==15
+ str $inp,[sp,#17*4] @ leave room for $t3
+#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
+ str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h
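
The guarded block is an input-load fast path: on __ARM_ARCH__>=7 the four ldrb/orr steps that assemble each big-endian word byte by byte are replaced with a single ldr, plus a rev when the core is little-endian. Roughly, in C (gcc/clang builtins assumed):

#include <stdint.h>

/* Pre-ARMv7 path: assemble the big-endian word from four byte loads. */
static uint32_t load_be32_bytes(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* ARMv7 path: one word load (ldr), then a byte swap (rev) when the
 * core runs little-endian. */
static uint32_t load_be32_word(const uint8_t *p)
{
	uint32_t w;
	__builtin_memcpy(&w, p, 4);	/* ldr */
#if defined(__ARMEL__)
	w = __builtin_bswap32(w);	/* rev */
#endif
	return w;
}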
@@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
+#if $i>=15
+ ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
+#endif
orr $t0,$a,$b
and $t1,$a,$b
and $t0,$t0,$c
@@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
+ mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- ldr $inp,[sp,#`($i+9)%16`*4]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- mov $t1,$t2,ror#$sigma1[0]
+ eor $t0,$t0,$t3,ror#$sigma0[1]
+ ldr $t1,[sp,#`($i+9)%16`*4]
+ eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
+ mov $t3,$t2,ror#$sigma1[0]
add $T1,$T1,$t0
- eor $t1,$t1,$t2,ror#$sigma1[1]
- add $T1,$T1,$inp
- eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
+ eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$t1
+ eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
+ @ add $T1,$T1,$t3
___
&BODY_00_15(@_);
}
$code=<<___;
+#include "arm_arch.h"
+
.text
.code 32
@@ -132,7 +153,7 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
@@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
- ldmia sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.s b/main/openssl/crypto/sha/asm/sha256-armv4.s
index ee903dc4..9c20a63c 100644
--- a/main/openssl/crypto/sha/asm/sha256-armv4.s
+++ b/main/openssl/crypto/sha/asm/sha256-armv4.s
@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
.text
.code 32
@@ -27,11 +29,14 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add r2,r1,r2,lsl#6 @ len to point at the end of inp
- stmdb sp!,{r0,r1,r2,r4-r12,lr}
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 0
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -39,14 +44,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r8,ror#6
- str r3,[sp,#0*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 0>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 0==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#0*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -55,6 +68,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 0>=15
+ ldr r1,[sp,#2*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -62,6 +78,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 1
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -69,14 +88,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r7,ror#6
- str r3,[sp,#1*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 1>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 1==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#1*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -85,6 +112,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 1>=15
+ ldr r1,[sp,#3*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -92,6 +122,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 2
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -99,14 +132,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r6,ror#6
- str r3,[sp,#2*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 2>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 2==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#2*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -115,6 +156,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 2>=15
+ ldr r1,[sp,#4*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -122,6 +166,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 3
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -129,14 +176,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r5,ror#6
- str r3,[sp,#3*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 3>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 3==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#3*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -145,6 +200,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 3>=15
+ ldr r1,[sp,#5*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -152,6 +210,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 4
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -159,14 +220,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r4,ror#6
- str r3,[sp,#4*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 4>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 4==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#4*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -175,6 +244,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 4>=15
+ ldr r1,[sp,#6*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -182,6 +254,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 5
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -189,14 +264,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r11,ror#6
- str r3,[sp,#5*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 5>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 5==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#5*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -205,6 +288,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 5>=15
+ ldr r1,[sp,#7*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -212,6 +298,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 6
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -219,14 +308,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r10,ror#6
- str r3,[sp,#6*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 6>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 6==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#6*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -235,6 +332,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 6>=15
+ ldr r1,[sp,#8*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -242,6 +342,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 7
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -249,14 +352,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r9,ror#6
- str r3,[sp,#7*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 7>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 7==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#7*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -265,6 +376,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 7>=15
+ ldr r1,[sp,#9*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -272,6 +386,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r8,r8,r3
add r4,r4,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 8
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -279,14 +396,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r8,ror#6
- str r3,[sp,#8*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 8>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 8==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#8*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -295,6 +420,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 8>=15
+ ldr r1,[sp,#10*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -302,6 +430,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 9
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -309,14 +440,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r7,ror#6
- str r3,[sp,#9*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 9>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 9==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#9*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -325,6 +464,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 9>=15
+ ldr r1,[sp,#11*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -332,6 +474,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 10
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -339,14 +484,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r6,ror#6
- str r3,[sp,#10*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 10>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 10==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#10*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -355,6 +508,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 10>=15
+ ldr r1,[sp,#12*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -362,6 +518,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 11
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -369,14 +528,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r5,ror#6
- str r3,[sp,#11*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 11>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 11==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#11*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -385,6 +552,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 11>=15
+ ldr r1,[sp,#13*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -392,6 +562,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 12
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -399,14 +572,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r4,ror#6
- str r3,[sp,#12*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 12>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 12==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#12*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -415,6 +596,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 12>=15
+ ldr r1,[sp,#14*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -422,6 +606,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 13
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -429,14 +616,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r11,ror#6
- str r3,[sp,#13*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 13>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 13==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#13*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -445,6 +640,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 13>=15
+ ldr r1,[sp,#15*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -452,6 +650,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 14
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -459,14 +660,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r10,ror#6
- str r3,[sp,#14*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 14>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 14==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#14*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -475,6 +684,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 14>=15
+ ldr r1,[sp,#0*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -482,6 +694,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 15
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -489,14 +704,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
- str r1,[sp,#17*4]
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r9,ror#6
- str r3,[sp,#15*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 15>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 15==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#15*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -505,6 +728,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 15>=15
+ ldr r1,[sp,#1*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -513,26 +739,34 @@ sha256_block_data_order:
add r8,r8,r3
add r4,r4,r0
.Lrounds_16_xx:
- ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#1*4] @ 16
ldr r12,[sp,#14*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#0*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#9*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#9*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r8,ror#6
- str r3,[sp,#0*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 16>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 16==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#0*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -541,6 +775,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 16>=15
+ ldr r1,[sp,#2*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -548,26 +785,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
- ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#2*4] @ 17
ldr r12,[sp,#15*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#1*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#10*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#10*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r7,ror#6
- str r3,[sp,#1*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 17>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 17==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#1*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -576,6 +821,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 17>=15
+ ldr r1,[sp,#3*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -583,26 +831,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
- ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#3*4] @ 18
ldr r12,[sp,#0*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#2*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#11*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#11*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r6,ror#6
- str r3,[sp,#2*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 18>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 18==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#2*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -611,6 +867,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 18>=15
+ ldr r1,[sp,#4*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -618,26 +877,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
- ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#4*4] @ 19
ldr r12,[sp,#1*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#3*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#12*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#12*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r5,ror#6
- str r3,[sp,#3*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 19>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 19==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#3*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -646,6 +913,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 19>=15
+ ldr r1,[sp,#5*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -653,26 +923,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
- ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#5*4] @ 20
ldr r12,[sp,#2*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#4*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#13*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#13*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r4,ror#6
- str r3,[sp,#4*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 20>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 20==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#4*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -681,6 +959,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 20>=15
+ ldr r1,[sp,#6*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -688,26 +969,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
- ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#6*4] @ 21
ldr r12,[sp,#3*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#5*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#14*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#14*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r11,ror#6
- str r3,[sp,#5*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 21>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 21==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#5*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -716,6 +1005,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 21>=15
+ ldr r1,[sp,#7*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -723,26 +1015,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
- ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#7*4] @ 22
ldr r12,[sp,#4*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#6*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#15*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#15*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r10,ror#6
- str r3,[sp,#6*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 22>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 22==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#6*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -751,6 +1051,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 22>=15
+ ldr r1,[sp,#8*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -758,26 +1061,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
- ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#8*4] @ 23
ldr r12,[sp,#5*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#7*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#0*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#0*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r9,ror#6
- str r3,[sp,#7*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 23>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 23==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#7*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -786,6 +1097,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 23>=15
+ ldr r1,[sp,#9*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -793,26 +1107,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r8,r8,r3
add r4,r4,r0
- ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#9*4] @ 24
ldr r12,[sp,#6*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#8*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#1*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#1*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r8,ror#6
- str r3,[sp,#8*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 24>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 24==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#8*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -821,6 +1143,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 24>=15
+ ldr r1,[sp,#10*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -828,26 +1153,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
- ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#10*4] @ 25
ldr r12,[sp,#7*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#9*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#2*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#2*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r7,ror#6
- str r3,[sp,#9*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 25>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 25==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#9*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -856,6 +1189,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 25>=15
+ ldr r1,[sp,#11*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -863,26 +1199,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
- ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#11*4] @ 26
ldr r12,[sp,#8*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#10*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#3*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#3*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r6,ror#6
- str r3,[sp,#10*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 26>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 26==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#10*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -891,6 +1235,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 26>=15
+ ldr r1,[sp,#12*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -898,26 +1245,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
- ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#12*4] @ 27
ldr r12,[sp,#9*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#11*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#4*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#4*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r5,ror#6
- str r3,[sp,#11*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 27>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 27==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#11*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -926,6 +1281,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 27>=15
+ ldr r1,[sp,#13*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -933,26 +1291,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
- ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#13*4] @ 28
ldr r12,[sp,#10*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#12*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#5*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#5*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r4,ror#6
- str r3,[sp,#12*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 28>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 28==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#12*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -961,6 +1327,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 28>=15
+ ldr r1,[sp,#14*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -968,26 +1337,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
- ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#14*4] @ 29
ldr r12,[sp,#11*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#13*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#6*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#6*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r11,ror#6
- str r3,[sp,#13*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 29>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 29==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#13*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -996,6 +1373,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 29>=15
+ ldr r1,[sp,#15*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -1003,26 +1383,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
- ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#15*4] @ 30
ldr r12,[sp,#12*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#14*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#7*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#7*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r10,ror#6
- str r3,[sp,#14*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 30>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 30==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#14*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -1031,6 +1419,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 30>=15
+ ldr r1,[sp,#0*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -1038,26 +1429,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
- ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#0*4] @ 31
ldr r12,[sp,#13*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#15*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#8*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#8*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r9,ror#6
- str r3,[sp,#15*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 31>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 31==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#15*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -1066,6 +1465,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 31>=15
+ ldr r1,[sp,#1*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -1102,10 +1504,14 @@ sha256_block_data_order:
bne .Loop
add sp,sp,#19*4 @ destroy frame
- ldmia sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align 2
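
The unrolled rounds above all share one pattern. The Sigma1(e), Ch(e,f,g), Sigma0(a) and Maj(a,b,c) comments name the standard FIPS 180-4 SHA-256 round functions; the rotate amounts, 6/11/25 for Sigma1 and 2/13/22 for Sigma0, are visible in the ror# operands. As a reading aid, here is a minimal C model of what each unrolled block computes. It is a sketch, not OpenSSL code; ror32 and sha256_round are hypothetical helpers, and the real assembly interleaves these steps across adjacent rounds for scheduling.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
        return (x >> n) | (x << (32 - n));
}

/* One SHA-256 compression round: state s[0..7] = a..h, round
 * constant Ki, schedule word Xi.  Mirrors the asm annotations. */
static void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Xi)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); /* Sigma1(e) */
        uint32_t ch  = (e & f) ^ (~e & g);                        /* Ch(e,f,g) */
        uint32_t t1  = h + S1 + ch + Ki + Xi;
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); /* Sigma0(a) */
        /* the asm uses the equivalent ((a|b)&c)|(a&b) form of Maj */
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);               /* Maj(a,b,c) */

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
}

The epilogue is also worth a word: on ARMv5 and later the function returns by popping pc directly, while the ARMv4 fallback pops lr and falls through to the literal .word 0xe12fff1e, which is the encoding of bx lr, keeping the routine interoperable with Thumb callers as the comment says.
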
diff --git a/main/openssl/crypto/sha/asm/sha256-mips.S b/main/openssl/crypto/sha/asm/sha256-mips.S
new file mode 100644
index 00000000..2bd728e9
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-mips.S
@@ -0,0 +1,1998 @@
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set noat
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+
+.align 5
+.globl sha256_block_data_order
+.ent sha256_block_data_order
+sha256_block_data_order:
+ .frame $29,128,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,128
+ sw $31,128-1*4($29)
+ sw $30,128-2*4($29)
+ sw $23,128-3*4($29)
+ sw $22,128-4*4($29)
+ sw $21,128-5*4($29)
+ sw $20,128-6*4($29)
+ sw $19,128-7*4($29)
+ sw $18,128-8*4($29)
+ sw $17,128-9*4($29)
+ sw $16,128-10*4($29)
+ sll $23,$6,6
+ .set reorder
+ la $6,K256 # PIC-ified 'load address'
+
+ lw $1,0*4($4) # load context
+ lw $2,1*4($4)
+ lw $3,2*4($4)
+ lw $7,3*4($4)
+ lw $24,4*4($4)
+ lw $25,5*4($4)
+ lw $30,6*4($4)
+ lw $31,7*4($4)
+
+ add $23,$5 # pointer to the end of input
+ sw $23,16*4($29)
+ b .Loop
+
+.align 5
+.Loop:
+ lwl $8,3($5)
+ lwr $8,0($5)
+ lwl $9,7($5)
+ lwr $9,4($5)
+ srl $13,$8,24 # byte swap(0)
+ srl $14,$8,8
+ andi $15,$8,0xFF00
+ sll $8,$8,24
+ andi $14,0xFF00
+ sll $15,$15,8
+ or $8,$13
+ or $14,$15
+ or $8,$14
+ addu $12,$8,$31 # 0
+ srl $31,$24,6
+ xor $15,$25,$30
+ sll $14,$24,7
+ and $15,$24
+ srl $13,$24,11
+ xor $31,$14
+ sll $14,$24,21
+ xor $31,$13
+ srl $13,$24,25
+ xor $31,$14
+ sll $14,$24,26
+ xor $31,$13
+ xor $15,$30 # Ch(e,f,g)
+ xor $13,$14,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $12,$15
+ lw $15,0($6) # K[0]
+ sll $14,$1,10
+ addu $12,$13
+ srl $13,$1,13
+ xor $31,$14
+ sll $14,$1,19
+ xor $31,$13
+ srl $13,$1,22
+ xor $31,$14
+ sll $14,$1,30
+ xor $31,$13
+ sw $8,0($29) # offload to ring buffer
+ xor $31,$14 # Sigma0(a)
+
+ or $13,$1,$2
+ and $14,$1,$2
+ and $13,$3
+ or $14,$13 # Maj(a,b,c)
+ addu $12,$15 # +=K[0]
+ addu $31,$14
+
+ addu $7,$12
+ addu $31,$12
+ lwl $10,11($5)
+ lwr $10,8($5)
+ srl $14,$9,24 # byte swap(1)
+ srl $15,$9,8
+ andi $16,$9,0xFF00
+ sll $9,$9,24
+ andi $15,0xFF00
+ sll $16,$16,8
+ or $9,$14
+ or $15,$16
+ or $9,$15
+ addu $13,$9,$30 # 1
+ srl $30,$7,6
+ xor $16,$24,$25
+ sll $15,$7,7
+ and $16,$7
+ srl $14,$7,11
+ xor $30,$15
+ sll $15,$7,21
+ xor $30,$14
+ srl $14,$7,25
+ xor $30,$15
+ sll $15,$7,26
+ xor $30,$14
+ xor $16,$25 # Ch(e,f,g)
+ xor $14,$15,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $13,$16
+ lw $16,4($6) # K[1]
+ sll $15,$31,10
+ addu $13,$14
+ srl $14,$31,13
+ xor $30,$15
+ sll $15,$31,19
+ xor $30,$14
+ srl $14,$31,22
+ xor $30,$15
+ sll $15,$31,30
+ xor $30,$14
+ sw $9,4($29) # offload to ring buffer
+ xor $30,$15 # Sigma0(a)
+
+ or $14,$31,$1
+ and $15,$31,$1
+ and $14,$2
+ or $15,$14 # Maj(a,b,c)
+ addu $13,$16 # +=K[1]
+ addu $30,$15
+
+ addu $3,$13
+ addu $30,$13
+ lwl $11,15($5)
+ lwr $11,12($5)
+ srl $15,$10,24 # byte swap(2)
+ srl $16,$10,8
+ andi $17,$10,0xFF00
+ sll $10,$10,24
+ andi $16,0xFF00
+ sll $17,$17,8
+ or $10,$15
+ or $16,$17
+ or $10,$16
+ addu $14,$10,$25 # 2
+ srl $25,$3,6
+ xor $17,$7,$24
+ sll $16,$3,7
+ and $17,$3
+ srl $15,$3,11
+ xor $25,$16
+ sll $16,$3,21
+ xor $25,$15
+ srl $15,$3,25
+ xor $25,$16
+ sll $16,$3,26
+ xor $25,$15
+ xor $17,$24 # Ch(e,f,g)
+ xor $15,$16,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $14,$17
+ lw $17,8($6) # K[2]
+ sll $16,$30,10
+ addu $14,$15
+ srl $15,$30,13
+ xor $25,$16
+ sll $16,$30,19
+ xor $25,$15
+ srl $15,$30,22
+ xor $25,$16
+ sll $16,$30,30
+ xor $25,$15
+ sw $10,8($29) # offload to ring buffer
+ xor $25,$16 # Sigma0(a)
+
+ or $15,$30,$31
+ and $16,$30,$31
+ and $15,$1
+ or $16,$15 # Maj(a,b,c)
+ addu $14,$17 # +=K[2]
+ addu $25,$16
+
+ addu $2,$14
+ addu $25,$14
+ lwl $12,19($5)
+ lwr $12,16($5)
+ srl $16,$11,24 # byte swap(3)
+ srl $17,$11,8
+ andi $18,$11,0xFF00
+ sll $11,$11,24
+ andi $17,0xFF00
+ sll $18,$18,8
+ or $11,$16
+ or $17,$18
+ or $11,$17
+ addu $15,$11,$24 # 3
+ srl $24,$2,6
+ xor $18,$3,$7
+ sll $17,$2,7
+ and $18,$2
+ srl $16,$2,11
+ xor $24,$17
+ sll $17,$2,21
+ xor $24,$16
+ srl $16,$2,25
+ xor $24,$17
+ sll $17,$2,26
+ xor $24,$16
+ xor $18,$7 # Ch(e,f,g)
+ xor $16,$17,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $15,$18
+ lw $18,12($6) # K[3]
+ sll $17,$25,10
+ addu $15,$16
+ srl $16,$25,13
+ xor $24,$17
+ sll $17,$25,19
+ xor $24,$16
+ srl $16,$25,22
+ xor $24,$17
+ sll $17,$25,30
+ xor $24,$16
+ sw $11,12($29) # offload to ring buffer
+ xor $24,$17 # Sigma0(a)
+
+ or $16,$25,$30
+ and $17,$25,$30
+ and $16,$31
+ or $17,$16 # Maj(a,b,c)
+ addu $15,$18 # +=K[3]
+ addu $24,$17
+
+ addu $1,$15
+ addu $24,$15
+ lwl $13,23($5)
+ lwr $13,20($5)
+ srl $17,$12,24 # byte swap(4)
+ srl $18,$12,8
+ andi $19,$12,0xFF00
+ sll $12,$12,24
+ andi $18,0xFF00
+ sll $19,$19,8
+ or $12,$17
+ or $18,$19
+ or $12,$18
+ addu $16,$12,$7 # 4
+ srl $7,$1,6
+ xor $19,$2,$3
+ sll $18,$1,7
+ and $19,$1
+ srl $17,$1,11
+ xor $7,$18
+ sll $18,$1,21
+ xor $7,$17
+ srl $17,$1,25
+ xor $7,$18
+ sll $18,$1,26
+ xor $7,$17
+ xor $19,$3 # Ch(e,f,g)
+ xor $17,$18,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $16,$19
+ lw $19,16($6) # K[4]
+ sll $18,$24,10
+ addu $16,$17
+ srl $17,$24,13
+ xor $7,$18
+ sll $18,$24,19
+ xor $7,$17
+ srl $17,$24,22
+ xor $7,$18
+ sll $18,$24,30
+ xor $7,$17
+ sw $12,16($29) # offload to ring buffer
+ xor $7,$18 # Sigma0(a)
+
+ or $17,$24,$25
+ and $18,$24,$25
+ and $17,$30
+ or $18,$17 # Maj(a,b,c)
+ addu $16,$19 # +=K[4]
+ addu $7,$18
+
+ addu $31,$16
+ addu $7,$16
+ lwl $14,27($5)
+ lwr $14,24($5)
+ srl $18,$13,24 # byte swap(5)
+ srl $19,$13,8
+ andi $20,$13,0xFF00
+ sll $13,$13,24
+ andi $19,0xFF00
+ sll $20,$20,8
+ or $13,$18
+ or $19,$20
+ or $13,$19
+ addu $17,$13,$3 # 5
+ srl $3,$31,6
+ xor $20,$1,$2
+ sll $19,$31,7
+ and $20,$31
+ srl $18,$31,11
+ xor $3,$19
+ sll $19,$31,21
+ xor $3,$18
+ srl $18,$31,25
+ xor $3,$19
+ sll $19,$31,26
+ xor $3,$18
+ xor $20,$2 # Ch(e,f,g)
+ xor $18,$19,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $17,$20
+ lw $20,20($6) # K[5]
+ sll $19,$7,10
+ addu $17,$18
+ srl $18,$7,13
+ xor $3,$19
+ sll $19,$7,19
+ xor $3,$18
+ srl $18,$7,22
+ xor $3,$19
+ sll $19,$7,30
+ xor $3,$18
+ sw $13,20($29) # offload to ring buffer
+ xor $3,$19 # Sigma0(a)
+
+ or $18,$7,$24
+ and $19,$7,$24
+ and $18,$25
+ or $19,$18 # Maj(a,b,c)
+ addu $17,$20 # +=K[5]
+ addu $3,$19
+
+ addu $30,$17
+ addu $3,$17
+ lwl $15,31($5)
+ lwr $15,28($5)
+ srl $19,$14,24 # byte swap(6)
+ srl $20,$14,8
+ andi $21,$14,0xFF00
+ sll $14,$14,24
+ andi $20,0xFF00
+ sll $21,$21,8
+ or $14,$19
+ or $20,$21
+ or $14,$20
+ addu $18,$14,$2 # 6
+ srl $2,$30,6
+ xor $21,$31,$1
+ sll $20,$30,7
+ and $21,$30
+ srl $19,$30,11
+ xor $2,$20
+ sll $20,$30,21
+ xor $2,$19
+ srl $19,$30,25
+ xor $2,$20
+ sll $20,$30,26
+ xor $2,$19
+ xor $21,$1 # Ch(e,f,g)
+ xor $19,$20,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $18,$21
+ lw $21,24($6) # K[6]
+ sll $20,$3,10
+ addu $18,$19
+ srl $19,$3,13
+ xor $2,$20
+ sll $20,$3,19
+ xor $2,$19
+ srl $19,$3,22
+ xor $2,$20
+ sll $20,$3,30
+ xor $2,$19
+ sw $14,24($29) # offload to ring buffer
+ xor $2,$20 # Sigma0(a)
+
+ or $19,$3,$7
+ and $20,$3,$7
+ and $19,$24
+ or $20,$19 # Maj(a,b,c)
+ addu $18,$21 # +=K[6]
+ addu $2,$20
+
+ addu $25,$18
+ addu $2,$18
+ lwl $16,35($5)
+ lwr $16,32($5)
+ srl $20,$15,24 # byte swap(7)
+ srl $21,$15,8
+ andi $22,$15,0xFF00
+ sll $15,$15,24
+ andi $21,0xFF00
+ sll $22,$22,8
+ or $15,$20
+ or $21,$22
+ or $15,$21
+ addu $19,$15,$1 # 7
+ srl $1,$25,6
+ xor $22,$30,$31
+ sll $21,$25,7
+ and $22,$25
+ srl $20,$25,11
+ xor $1,$21
+ sll $21,$25,21
+ xor $1,$20
+ srl $20,$25,25
+ xor $1,$21
+ sll $21,$25,26
+ xor $1,$20
+ xor $22,$31 # Ch(e,f,g)
+ xor $20,$21,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $19,$22
+ lw $22,28($6) # K[7]
+ sll $21,$2,10
+ addu $19,$20
+ srl $20,$2,13
+ xor $1,$21
+ sll $21,$2,19
+ xor $1,$20
+ srl $20,$2,22
+ xor $1,$21
+ sll $21,$2,30
+ xor $1,$20
+ sw $15,28($29) # offload to ring buffer
+ xor $1,$21 # Sigma0(a)
+
+ or $20,$2,$3
+ and $21,$2,$3
+ and $20,$7
+ or $21,$20 # Maj(a,b,c)
+ addu $19,$22 # +=K[7]
+ addu $1,$21
+
+ addu $24,$19
+ addu $1,$19
+ lwl $17,39($5)
+ lwr $17,36($5)
+ srl $21,$16,24 # byte swap(8)
+ srl $22,$16,8
+ andi $23,$16,0xFF00
+ sll $16,$16,24
+ andi $22,0xFF00
+ sll $23,$23,8
+ or $16,$21
+ or $22,$23
+ or $16,$22
+ addu $20,$16,$31 # 8
+ srl $31,$24,6
+ xor $23,$25,$30
+ sll $22,$24,7
+ and $23,$24
+ srl $21,$24,11
+ xor $31,$22
+ sll $22,$24,21
+ xor $31,$21
+ srl $21,$24,25
+ xor $31,$22
+ sll $22,$24,26
+ xor $31,$21
+ xor $23,$30 # Ch(e,f,g)
+ xor $21,$22,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $20,$23
+ lw $23,32($6) # K[8]
+ sll $22,$1,10
+ addu $20,$21
+ srl $21,$1,13
+ xor $31,$22
+ sll $22,$1,19
+ xor $31,$21
+ srl $21,$1,22
+ xor $31,$22
+ sll $22,$1,30
+ xor $31,$21
+ sw $16,32($29) # offload to ring buffer
+ xor $31,$22 # Sigma0(a)
+
+ or $21,$1,$2
+ and $22,$1,$2
+ and $21,$3
+ or $22,$21 # Maj(a,b,c)
+ addu $20,$23 # +=K[8]
+ addu $31,$22
+
+ addu $7,$20
+ addu $31,$20
+ lwl $18,43($5)
+ lwr $18,40($5)
+ srl $22,$17,24 # byte swap(9)
+ srl $23,$17,8
+ andi $8,$17,0xFF00
+ sll $17,$17,24
+ andi $23,0xFF00
+ sll $8,$8,8
+ or $17,$22
+ or $23,$8
+ or $17,$23
+ addu $21,$17,$30 # 9
+ srl $30,$7,6
+ xor $8,$24,$25
+ sll $23,$7,7
+ and $8,$7
+ srl $22,$7,11
+ xor $30,$23
+ sll $23,$7,21
+ xor $30,$22
+ srl $22,$7,25
+ xor $30,$23
+ sll $23,$7,26
+ xor $30,$22
+ xor $8,$25 # Ch(e,f,g)
+ xor $22,$23,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $21,$8
+ lw $8,36($6) # K[9]
+ sll $23,$31,10
+ addu $21,$22
+ srl $22,$31,13
+ xor $30,$23
+ sll $23,$31,19
+ xor $30,$22
+ srl $22,$31,22
+ xor $30,$23
+ sll $23,$31,30
+ xor $30,$22
+ sw $17,36($29) # offload to ring buffer
+ xor $30,$23 # Sigma0(a)
+
+ or $22,$31,$1
+ and $23,$31,$1
+ and $22,$2
+ or $23,$22 # Maj(a,b,c)
+ addu $21,$8 # +=K[9]
+ addu $30,$23
+
+ addu $3,$21
+ addu $30,$21
+ lwl $19,47($5)
+ lwr $19,44($5)
+ srl $23,$18,24 # byte swap(10)
+ srl $8,$18,8
+ andi $9,$18,0xFF00
+ sll $18,$18,24
+ andi $8,0xFF00
+ sll $9,$9,8
+ or $18,$23
+ or $8,$9
+ or $18,$8
+ addu $22,$18,$25 # 10
+ srl $25,$3,6
+ xor $9,$7,$24
+ sll $8,$3,7
+ and $9,$3
+ srl $23,$3,11
+ xor $25,$8
+ sll $8,$3,21
+ xor $25,$23
+ srl $23,$3,25
+ xor $25,$8
+ sll $8,$3,26
+ xor $25,$23
+ xor $9,$24 # Ch(e,f,g)
+ xor $23,$8,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $22,$9
+ lw $9,40($6) # K[10]
+ sll $8,$30,10
+ addu $22,$23
+ srl $23,$30,13
+ xor $25,$8
+ sll $8,$30,19
+ xor $25,$23
+ srl $23,$30,22
+ xor $25,$8
+ sll $8,$30,30
+ xor $25,$23
+ sw $18,40($29) # offload to ring buffer
+ xor $25,$8 # Sigma0(a)
+
+ or $23,$30,$31
+ and $8,$30,$31
+ and $23,$1
+ or $8,$23 # Maj(a,b,c)
+ addu $22,$9 # +=K[10]
+ addu $25,$8
+
+ addu $2,$22
+ addu $25,$22
+ lwl $20,51($5)
+ lwr $20,48($5)
+ srl $8,$19,24 # byte swap(11)
+ srl $9,$19,8
+ andi $10,$19,0xFF00
+ sll $19,$19,24
+ andi $9,0xFF00
+ sll $10,$10,8
+ or $19,$8
+ or $9,$10
+ or $19,$9
+ addu $23,$19,$24 # 11
+ srl $24,$2,6
+ xor $10,$3,$7
+ sll $9,$2,7
+ and $10,$2
+ srl $8,$2,11
+ xor $24,$9
+ sll $9,$2,21
+ xor $24,$8
+ srl $8,$2,25
+ xor $24,$9
+ sll $9,$2,26
+ xor $24,$8
+ xor $10,$7 # Ch(e,f,g)
+ xor $8,$9,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $23,$10
+ lw $10,44($6) # K[11]
+ sll $9,$25,10
+ addu $23,$8
+ srl $8,$25,13
+ xor $24,$9
+ sll $9,$25,19
+ xor $24,$8
+ srl $8,$25,22
+ xor $24,$9
+ sll $9,$25,30
+ xor $24,$8
+ sw $19,44($29) # offload to ring buffer
+ xor $24,$9 # Sigma0(a)
+
+ or $8,$25,$30
+ and $9,$25,$30
+ and $8,$31
+ or $9,$8 # Maj(a,b,c)
+ addu $23,$10 # +=K[11]
+ addu $24,$9
+
+ addu $1,$23
+ addu $24,$23
+ lwl $21,55($5)
+ lwr $21,52($5)
+ srl $9,$20,24 # byte swap(12)
+ srl $10,$20,8
+ andi $11,$20,0xFF00
+ sll $20,$20,24
+ andi $10,0xFF00
+ sll $11,$11,8
+ or $20,$9
+ or $10,$11
+ or $20,$10
+ addu $8,$20,$7 # 12
+ srl $7,$1,6
+ xor $11,$2,$3
+ sll $10,$1,7
+ and $11,$1
+ srl $9,$1,11
+ xor $7,$10
+ sll $10,$1,21
+ xor $7,$9
+ srl $9,$1,25
+ xor $7,$10
+ sll $10,$1,26
+ xor $7,$9
+ xor $11,$3 # Ch(e,f,g)
+ xor $9,$10,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $8,$11
+ lw $11,48($6) # K[12]
+ sll $10,$24,10
+ addu $8,$9
+ srl $9,$24,13
+ xor $7,$10
+ sll $10,$24,19
+ xor $7,$9
+ srl $9,$24,22
+ xor $7,$10
+ sll $10,$24,30
+ xor $7,$9
+ sw $20,48($29) # offload to ring buffer
+ xor $7,$10 # Sigma0(a)
+
+ or $9,$24,$25
+ and $10,$24,$25
+ and $9,$30
+ or $10,$9 # Maj(a,b,c)
+ addu $8,$11 # +=K[12]
+ addu $7,$10
+
+ addu $31,$8
+ addu $7,$8
+ lwl $22,59($5)
+ lwr $22,56($5)
+ srl $10,$21,24 # byte swap(13)
+ srl $11,$21,8
+ andi $12,$21,0xFF00
+ sll $21,$21,24
+ andi $11,0xFF00
+ sll $12,$12,8
+ or $21,$10
+ or $11,$12
+ or $21,$11
+ addu $9,$21,$3 # 13
+ srl $3,$31,6
+ xor $12,$1,$2
+ sll $11,$31,7
+ and $12,$31
+ srl $10,$31,11
+ xor $3,$11
+ sll $11,$31,21
+ xor $3,$10
+ srl $10,$31,25
+ xor $3,$11
+ sll $11,$31,26
+ xor $3,$10
+ xor $12,$2 # Ch(e,f,g)
+ xor $10,$11,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $9,$12
+ lw $12,52($6) # K[13]
+ sll $11,$7,10
+ addu $9,$10
+ srl $10,$7,13
+ xor $3,$11
+ sll $11,$7,19
+ xor $3,$10
+ srl $10,$7,22
+ xor $3,$11
+ sll $11,$7,30
+ xor $3,$10
+ sw $21,52($29) # offload to ring buffer
+ xor $3,$11 # Sigma0(a)
+
+ or $10,$7,$24
+ and $11,$7,$24
+ and $10,$25
+ or $11,$10 # Maj(a,b,c)
+ addu $9,$12 # +=K[13]
+ addu $3,$11
+
+ addu $30,$9
+ addu $3,$9
+ lw $8,0($29) # prefetch from ring buffer
+ lwl $23,63($5)
+ lwr $23,60($5)
+ srl $11,$22,24 # byte swap(14)
+ srl $12,$22,8
+ andi $13,$22,0xFF00
+ sll $22,$22,24
+ andi $12,0xFF00
+ sll $13,$13,8
+ or $22,$11
+ or $12,$13
+ or $22,$12
+ addu $10,$22,$2 # 14
+ srl $2,$30,6
+ xor $13,$31,$1
+ sll $12,$30,7
+ and $13,$30
+ srl $11,$30,11
+ xor $2,$12
+ sll $12,$30,21
+ xor $2,$11
+ srl $11,$30,25
+ xor $2,$12
+ sll $12,$30,26
+ xor $2,$11
+ xor $13,$1 # Ch(e,f,g)
+ xor $11,$12,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $10,$13
+ lw $13,56($6) # K[14]
+ sll $12,$3,10
+ addu $10,$11
+ srl $11,$3,13
+ xor $2,$12
+ sll $12,$3,19
+ xor $2,$11
+ srl $11,$3,22
+ xor $2,$12
+ sll $12,$3,30
+ xor $2,$11
+ sw $22,56($29) # offload to ring buffer
+ xor $2,$12 # Sigma0(a)
+
+ or $11,$3,$7
+ and $12,$3,$7
+ and $11,$24
+ or $12,$11 # Maj(a,b,c)
+ addu $10,$13 # +=K[14]
+ addu $2,$12
+
+ addu $25,$10
+ addu $2,$10
+ lw $9,4($29) # prefetch from ring buffer
+ srl $12,$23,24 # byte swap(15)
+ srl $13,$23,8
+ andi $14,$23,0xFF00
+ sll $23,$23,24
+ andi $13,0xFF00
+ sll $14,$14,8
+ or $23,$12
+ or $13,$14
+ or $23,$13
+ addu $11,$23,$1 # 15
+ srl $1,$25,6
+ xor $14,$30,$31
+ sll $13,$25,7
+ and $14,$25
+ srl $12,$25,11
+ xor $1,$13
+ sll $13,$25,21
+ xor $1,$12
+ srl $12,$25,25
+ xor $1,$13
+ sll $13,$25,26
+ xor $1,$12
+ xor $14,$31 # Ch(e,f,g)
+ xor $12,$13,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $11,$14
+ lw $14,60($6) # K[15]
+ sll $13,$2,10
+ addu $11,$12
+ srl $12,$2,13
+ xor $1,$13
+ sll $13,$2,19
+ xor $1,$12
+ srl $12,$2,22
+ xor $1,$13
+ sll $13,$2,30
+ xor $1,$12
+ sw $23,60($29) # offload to ring buffer
+ xor $1,$13 # Sigma0(a)
+
+ or $12,$2,$3
+ and $13,$2,$3
+ and $12,$7
+ or $13,$12 # Maj(a,b,c)
+ addu $11,$14 # +=K[15]
+ addu $1,$13
+
+ addu $24,$11
+ addu $1,$11
+ lw $10,8($29) # prefetch from ring buffer
+ b .L16_xx
+.align 4
+.L16_xx:
+ srl $14,$9,3 # Xupdate(16)
+ addu $8,$17 # +=X[i+9]
+ sll $13,$9,14
+ srl $12,$9,7
+ xor $14,$13
+ sll $13,11
+ xor $14,$12
+ srl $12,$9,18
+ xor $14,$13
+
+ srl $15,$22,10
+ xor $14,$12 # sigma0(X[i+1])
+ sll $13,$22,13
+ addu $8,$14
+ srl $12,$22,17
+ xor $15,$13
+ sll $13,2
+ xor $15,$12
+ srl $12,$22,19
+ xor $15,$13
+
+ xor $15,$12 # sigma1(X[i+14])
+ addu $8,$15
+ addu $12,$8,$31 # 16
+ srl $31,$24,6
+ xor $15,$25,$30
+ sll $14,$24,7
+ and $15,$24
+ srl $13,$24,11
+ xor $31,$14
+ sll $14,$24,21
+ xor $31,$13
+ srl $13,$24,25
+ xor $31,$14
+ sll $14,$24,26
+ xor $31,$13
+ xor $15,$30 # Ch(e,f,g)
+ xor $13,$14,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $12,$15
+ lw $15,64($6) # K[16]
+ sll $14,$1,10
+ addu $12,$13
+ srl $13,$1,13
+ xor $31,$14
+ sll $14,$1,19
+ xor $31,$13
+ srl $13,$1,22
+ xor $31,$14
+ sll $14,$1,30
+ xor $31,$13
+ sw $8,0($29) # offload to ring buffer
+ xor $31,$14 # Sigma0(a)
+
+ or $13,$1,$2
+ and $14,$1,$2
+ and $13,$3
+ or $14,$13 # Maj(a,b,c)
+ addu $12,$15 # +=K[16]
+ addu $31,$14
+
+ addu $7,$12
+ addu $31,$12
+ lw $11,12($29) # prefetch from ring buffer
+ srl $15,$10,3 # Xupdate(17)
+ addu $9,$18 # +=X[i+9]
+ sll $14,$10,14
+ srl $13,$10,7
+ xor $15,$14
+ sll $14,11
+ xor $15,$13
+ srl $13,$10,18
+ xor $15,$14
+
+ srl $16,$23,10
+ xor $15,$13 # sigma0(X[i+1])
+ sll $14,$23,13
+ addu $9,$15
+ srl $13,$23,17
+ xor $16,$14
+ sll $14,2
+ xor $16,$13
+ srl $13,$23,19
+ xor $16,$14
+
+ xor $16,$13 # sigma1(X[i+14])
+ addu $9,$16
+ addu $13,$9,$30 # 17
+ srl $30,$7,6
+ xor $16,$24,$25
+ sll $15,$7,7
+ and $16,$7
+ srl $14,$7,11
+ xor $30,$15
+ sll $15,$7,21
+ xor $30,$14
+ srl $14,$7,25
+ xor $30,$15
+ sll $15,$7,26
+ xor $30,$14
+ xor $16,$25 # Ch(e,f,g)
+ xor $14,$15,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $13,$16
+ lw $16,68($6) # K[17]
+ sll $15,$31,10
+ addu $13,$14
+ srl $14,$31,13
+ xor $30,$15
+ sll $15,$31,19
+ xor $30,$14
+ srl $14,$31,22
+ xor $30,$15
+ sll $15,$31,30
+ xor $30,$14
+ sw $9,4($29) # offload to ring buffer
+ xor $30,$15 # Sigma0(a)
+
+ or $14,$31,$1
+ and $15,$31,$1
+ and $14,$2
+ or $15,$14 # Maj(a,b,c)
+ addu $13,$16 # +=K[17]
+ addu $30,$15
+
+ addu $3,$13
+ addu $30,$13
+ lw $12,16($29) # prefetch from ring buffer
+ srl $16,$11,3 # Xupdate(18)
+ addu $10,$19 # +=X[i+9]
+ sll $15,$11,14
+ srl $14,$11,7
+ xor $16,$15
+ sll $15,11
+ xor $16,$14
+ srl $14,$11,18
+ xor $16,$15
+
+ srl $17,$8,10
+ xor $16,$14 # sigma0(X[i+1])
+ sll $15,$8,13
+ addu $10,$16
+ srl $14,$8,17
+ xor $17,$15
+ sll $15,2
+ xor $17,$14
+ srl $14,$8,19
+ xor $17,$15
+
+ xor $17,$14 # sigma1(X[i+14])
+ addu $10,$17
+ addu $14,$10,$25 # 18
+ srl $25,$3,6
+ xor $17,$7,$24
+ sll $16,$3,7
+ and $17,$3
+ srl $15,$3,11
+ xor $25,$16
+ sll $16,$3,21
+ xor $25,$15
+ srl $15,$3,25
+ xor $25,$16
+ sll $16,$3,26
+ xor $25,$15
+ xor $17,$24 # Ch(e,f,g)
+ xor $15,$16,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $14,$17
+ lw $17,72($6) # K[18]
+ sll $16,$30,10
+ addu $14,$15
+ srl $15,$30,13
+ xor $25,$16
+ sll $16,$30,19
+ xor $25,$15
+ srl $15,$30,22
+ xor $25,$16
+ sll $16,$30,30
+ xor $25,$15
+ sw $10,8($29) # offload to ring buffer
+ xor $25,$16 # Sigma0(a)
+
+ or $15,$30,$31
+ and $16,$30,$31
+ and $15,$1
+ or $16,$15 # Maj(a,b,c)
+ addu $14,$17 # +=K[18]
+ addu $25,$16
+
+ addu $2,$14
+ addu $25,$14
+ lw $13,20($29) # prefetch from ring buffer
+ srl $17,$12,3 # Xupdate(19)
+ addu $11,$20 # +=X[i+9]
+ sll $16,$12,14
+ srl $15,$12,7
+ xor $17,$16
+ sll $16,11
+ xor $17,$15
+ srl $15,$12,18
+ xor $17,$16
+
+ srl $18,$9,10
+ xor $17,$15 # sigma0(X[i+1])
+ sll $16,$9,13
+ addu $11,$17
+ srl $15,$9,17
+ xor $18,$16
+ sll $16,2
+ xor $18,$15
+ srl $15,$9,19
+ xor $18,$16
+
+ xor $18,$15 # sigma1(X[i+14])
+ addu $11,$18
+ addu $15,$11,$24 # 19
+ srl $24,$2,6
+ xor $18,$3,$7
+ sll $17,$2,7
+ and $18,$2
+ srl $16,$2,11
+ xor $24,$17
+ sll $17,$2,21
+ xor $24,$16
+ srl $16,$2,25
+ xor $24,$17
+ sll $17,$2,26
+ xor $24,$16
+ xor $18,$7 # Ch(e,f,g)
+ xor $16,$17,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $15,$18
+ lw $18,76($6) # K[19]
+ sll $17,$25,10
+ addu $15,$16
+ srl $16,$25,13
+ xor $24,$17
+ sll $17,$25,19
+ xor $24,$16
+ srl $16,$25,22
+ xor $24,$17
+ sll $17,$25,30
+ xor $24,$16
+ sw $11,12($29) # offload to ring buffer
+ xor $24,$17 # Sigma0(a)
+
+ or $16,$25,$30
+ and $17,$25,$30
+ and $16,$31
+ or $17,$16 # Maj(a,b,c)
+ addu $15,$18 # +=K[19]
+ addu $24,$17
+
+ addu $1,$15
+ addu $24,$15
+ lw $14,24($29) # prefetch from ring buffer
+ srl $18,$13,3 # Xupdate(20)
+ addu $12,$21 # +=X[i+9]
+ sll $17,$13,14
+ srl $16,$13,7
+ xor $18,$17
+ sll $17,11
+ xor $18,$16
+ srl $16,$13,18
+ xor $18,$17
+
+ srl $19,$10,10
+ xor $18,$16 # sigma0(X[i+1])
+ sll $17,$10,13
+ addu $12,$18
+ srl $16,$10,17
+ xor $19,$17
+ sll $17,2
+ xor $19,$16
+ srl $16,$10,19
+ xor $19,$17
+
+ xor $19,$16 # sigma1(X[i+14])
+ addu $12,$19
+ addu $16,$12,$7 # 20
+ srl $7,$1,6
+ xor $19,$2,$3
+ sll $18,$1,7
+ and $19,$1
+ srl $17,$1,11
+ xor $7,$18
+ sll $18,$1,21
+ xor $7,$17
+ srl $17,$1,25
+ xor $7,$18
+ sll $18,$1,26
+ xor $7,$17
+ xor $19,$3 # Ch(e,f,g)
+ xor $17,$18,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $16,$19
+ lw $19,80($6) # K[20]
+ sll $18,$24,10
+ addu $16,$17
+ srl $17,$24,13
+ xor $7,$18
+ sll $18,$24,19
+ xor $7,$17
+ srl $17,$24,22
+ xor $7,$18
+ sll $18,$24,30
+ xor $7,$17
+ sw $12,16($29) # offload to ring buffer
+ xor $7,$18 # Sigma0(a)
+
+ or $17,$24,$25
+ and $18,$24,$25
+ and $17,$30
+ or $18,$17 # Maj(a,b,c)
+ addu $16,$19 # +=K[20]
+ addu $7,$18
+
+ addu $31,$16
+ addu $7,$16
+ lw $15,28($29) # prefetch from ring buffer
+ srl $19,$14,3 # Xupdate(21)
+ addu $13,$22 # +=X[i+9]
+ sll $18,$14,14
+ srl $17,$14,7
+ xor $19,$18
+ sll $18,11
+ xor $19,$17
+ srl $17,$14,18
+ xor $19,$18
+
+ srl $20,$11,10
+ xor $19,$17 # sigma0(X[i+1])
+ sll $18,$11,13
+ addu $13,$19
+ srl $17,$11,17
+ xor $20,$18
+ sll $18,2
+ xor $20,$17
+ srl $17,$11,19
+ xor $20,$18
+
+ xor $20,$17 # sigma1(X[i+14])
+ addu $13,$20
+ addu $17,$13,$3 # 21
+ srl $3,$31,6
+ xor $20,$1,$2
+ sll $19,$31,7
+ and $20,$31
+ srl $18,$31,11
+ xor $3,$19
+ sll $19,$31,21
+ xor $3,$18
+ srl $18,$31,25
+ xor $3,$19
+ sll $19,$31,26
+ xor $3,$18
+ xor $20,$2 # Ch(e,f,g)
+ xor $18,$19,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $17,$20
+ lw $20,84($6) # K[21]
+ sll $19,$7,10
+ addu $17,$18
+ srl $18,$7,13
+ xor $3,$19
+ sll $19,$7,19
+ xor $3,$18
+ srl $18,$7,22
+ xor $3,$19
+ sll $19,$7,30
+ xor $3,$18
+ sw $13,20($29) # offload to ring buffer
+ xor $3,$19 # Sigma0(a)
+
+ or $18,$7,$24
+ and $19,$7,$24
+ and $18,$25
+ or $19,$18 # Maj(a,b,c)
+ addu $17,$20 # +=K[21]
+ addu $3,$19
+
+ addu $30,$17
+ addu $3,$17
+ lw $16,32($29) # prefetch from ring buffer
+ srl $20,$15,3 # Xupdate(22)
+ addu $14,$23 # +=X[i+9]
+ sll $19,$15,14
+ srl $18,$15,7
+ xor $20,$19
+ sll $19,11
+ xor $20,$18
+ srl $18,$15,18
+ xor $20,$19
+
+ srl $21,$12,10
+ xor $20,$18 # sigma0(X[i+1])
+ sll $19,$12,13
+ addu $14,$20
+ srl $18,$12,17
+ xor $21,$19
+ sll $19,2
+ xor $21,$18
+ srl $18,$12,19
+ xor $21,$19
+
+ xor $21,$18 # sigma1(X[i+14])
+ addu $14,$21
+ addu $18,$14,$2 # 22
+ srl $2,$30,6
+ xor $21,$31,$1
+ sll $20,$30,7
+ and $21,$30
+ srl $19,$30,11
+ xor $2,$20
+ sll $20,$30,21
+ xor $2,$19
+ srl $19,$30,25
+ xor $2,$20
+ sll $20,$30,26
+ xor $2,$19
+ xor $21,$1 # Ch(e,f,g)
+ xor $19,$20,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $18,$21
+ lw $21,88($6) # K[22]
+ sll $20,$3,10
+ addu $18,$19
+ srl $19,$3,13
+ xor $2,$20
+ sll $20,$3,19
+ xor $2,$19
+ srl $19,$3,22
+ xor $2,$20
+ sll $20,$3,30
+ xor $2,$19
+ sw $14,24($29) # offload to ring buffer
+ xor $2,$20 # Sigma0(a)
+
+ or $19,$3,$7
+ and $20,$3,$7
+ and $19,$24
+ or $20,$19 # Maj(a,b,c)
+ addu $18,$21 # +=K[22]
+ addu $2,$20
+
+ addu $25,$18
+ addu $2,$18
+ lw $17,36($29) # prefetch from ring buffer
+ srl $21,$16,3 # Xupdate(23)
+ addu $15,$8 # +=X[i+9]
+ sll $20,$16,14
+ srl $19,$16,7
+ xor $21,$20
+ sll $20,11
+ xor $21,$19
+ srl $19,$16,18
+ xor $21,$20
+
+ srl $22,$13,10
+ xor $21,$19 # sigma0(X[i+1])
+ sll $20,$13,13
+ addu $15,$21
+ srl $19,$13,17
+ xor $22,$20
+ sll $20,2
+ xor $22,$19
+ srl $19,$13,19
+ xor $22,$20
+
+ xor $22,$19 # sigma1(X[i+14])
+ addu $15,$22
+ addu $19,$15,$1 # 23
+ srl $1,$25,6
+ xor $22,$30,$31
+ sll $21,$25,7
+ and $22,$25
+ srl $20,$25,11
+ xor $1,$21
+ sll $21,$25,21
+ xor $1,$20
+ srl $20,$25,25
+ xor $1,$21
+ sll $21,$25,26
+ xor $1,$20
+ xor $22,$31 # Ch(e,f,g)
+ xor $20,$21,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $19,$22
+ lw $22,92($6) # K[23]
+ sll $21,$2,10
+ addu $19,$20
+ srl $20,$2,13
+ xor $1,$21
+ sll $21,$2,19
+ xor $1,$20
+ srl $20,$2,22
+ xor $1,$21
+ sll $21,$2,30
+ xor $1,$20
+ sw $15,28($29) # offload to ring buffer
+ xor $1,$21 # Sigma0(a)
+
+ or $20,$2,$3
+ and $21,$2,$3
+ and $20,$7
+ or $21,$20 # Maj(a,b,c)
+ addu $19,$22 # +=K[23]
+ addu $1,$21
+
+ addu $24,$19
+ addu $1,$19
+ lw $18,40($29) # prefetch from ring buffer
+ srl $22,$17,3 # Xupdate(24)
+ addu $16,$9 # +=X[i+9]
+ sll $21,$17,14
+ srl $20,$17,7
+ xor $22,$21
+ sll $21,11
+ xor $22,$20
+ srl $20,$17,18
+ xor $22,$21
+
+ srl $23,$14,10
+ xor $22,$20 # sigma0(X[i+1])
+ sll $21,$14,13
+ addu $16,$22
+ srl $20,$14,17
+ xor $23,$21
+ sll $21,2
+ xor $23,$20
+ srl $20,$14,19
+ xor $23,$21
+
+ xor $23,$20 # sigma1(X[i+14])
+ addu $16,$23
+ addu $20,$16,$31 # 24
+ srl $31,$24,6
+ xor $23,$25,$30
+ sll $22,$24,7
+ and $23,$24
+ srl $21,$24,11
+ xor $31,$22
+ sll $22,$24,21
+ xor $31,$21
+ srl $21,$24,25
+ xor $31,$22
+ sll $22,$24,26
+ xor $31,$21
+ xor $23,$30 # Ch(e,f,g)
+ xor $21,$22,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $20,$23
+ lw $23,96($6) # K[24]
+ sll $22,$1,10
+ addu $20,$21
+ srl $21,$1,13
+ xor $31,$22
+ sll $22,$1,19
+ xor $31,$21
+ srl $21,$1,22
+ xor $31,$22
+ sll $22,$1,30
+ xor $31,$21
+ sw $16,32($29) # offload to ring buffer
+ xor $31,$22 # Sigma0(a)
+
+ or $21,$1,$2
+ and $22,$1,$2
+ and $21,$3
+ or $22,$21 # Maj(a,b,c)
+ addu $20,$23 # +=K[24]
+ addu $31,$22
+
+ addu $7,$20
+ addu $31,$20
+ lw $19,44($29) # prefetch from ring buffer
+ srl $23,$18,3 # Xupdate(25)
+ addu $17,$10 # +=X[i+9]
+ sll $22,$18,14
+ srl $21,$18,7
+ xor $23,$22
+ sll $22,11
+ xor $23,$21
+ srl $21,$18,18
+ xor $23,$22
+
+ srl $8,$15,10
+ xor $23,$21 # sigma0(X[i+1])
+ sll $22,$15,13
+ addu $17,$23
+ srl $21,$15,17
+ xor $8,$22
+ sll $22,2
+ xor $8,$21
+ srl $21,$15,19
+ xor $8,$22
+
+ xor $8,$21 # sigma1(X[i+14])
+ addu $17,$8
+ addu $21,$17,$30 # 25
+ srl $30,$7,6
+ xor $8,$24,$25
+ sll $23,$7,7
+ and $8,$7
+ srl $22,$7,11
+ xor $30,$23
+ sll $23,$7,21
+ xor $30,$22
+ srl $22,$7,25
+ xor $30,$23
+ sll $23,$7,26
+ xor $30,$22
+ xor $8,$25 # Ch(e,f,g)
+ xor $22,$23,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $21,$8
+ lw $8,100($6) # K[25]
+ sll $23,$31,10
+ addu $21,$22
+ srl $22,$31,13
+ xor $30,$23
+ sll $23,$31,19
+ xor $30,$22
+ srl $22,$31,22
+ xor $30,$23
+ sll $23,$31,30
+ xor $30,$22
+ sw $17,36($29) # offload to ring buffer
+ xor $30,$23 # Sigma0(a)
+
+ or $22,$31,$1
+ and $23,$31,$1
+ and $22,$2
+ or $23,$22 # Maj(a,b,c)
+ addu $21,$8 # +=K[25]
+ addu $30,$23
+
+ addu $3,$21
+ addu $30,$21
+ lw $20,48($29) # prefetch from ring buffer
+ srl $8,$19,3 # Xupdate(26)
+ addu $18,$11 # +=X[i+9]
+ sll $23,$19,14
+ srl $22,$19,7
+ xor $8,$23
+ sll $23,11
+ xor $8,$22
+ srl $22,$19,18
+ xor $8,$23
+
+ srl $9,$16,10
+ xor $8,$22 # sigma0(X[i+1])
+ sll $23,$16,13
+ addu $18,$8
+ srl $22,$16,17
+ xor $9,$23
+ sll $23,2
+ xor $9,$22
+ srl $22,$16,19
+ xor $9,$23
+
+ xor $9,$22 # sigma1(X[i+14])
+ addu $18,$9
+ addu $22,$18,$25 # 26
+ srl $25,$3,6
+ xor $9,$7,$24
+ sll $8,$3,7
+ and $9,$3
+ srl $23,$3,11
+ xor $25,$8
+ sll $8,$3,21
+ xor $25,$23
+ srl $23,$3,25
+ xor $25,$8
+ sll $8,$3,26
+ xor $25,$23
+ xor $9,$24 # Ch(e,f,g)
+ xor $23,$8,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $22,$9
+ lw $9,104($6) # K[26]
+ sll $8,$30,10
+ addu $22,$23
+ srl $23,$30,13
+ xor $25,$8
+ sll $8,$30,19
+ xor $25,$23
+ srl $23,$30,22
+ xor $25,$8
+ sll $8,$30,30
+ xor $25,$23
+ sw $18,40($29) # offload to ring buffer
+ xor $25,$8 # Sigma0(a)
+
+ or $23,$30,$31
+ and $8,$30,$31
+ and $23,$1
+ or $8,$23 # Maj(a,b,c)
+ addu $22,$9 # +=K[26]
+ addu $25,$8
+
+ addu $2,$22
+ addu $25,$22
+ lw $21,52($29) # prefetch from ring buffer
+ srl $9,$20,3 # Xupdate(27)
+ addu $19,$12 # +=X[i+9]
+ sll $8,$20,14
+ srl $23,$20,7
+ xor $9,$8
+ sll $8,11
+ xor $9,$23
+ srl $23,$20,18
+ xor $9,$8
+
+ srl $10,$17,10
+ xor $9,$23 # sigma0(X[i+1])
+ sll $8,$17,13
+ addu $19,$9
+ srl $23,$17,17
+ xor $10,$8
+ sll $8,2
+ xor $10,$23
+ srl $23,$17,19
+ xor $10,$8
+
+ xor $10,$23 # sigma1(X[i+14])
+ addu $19,$10
+ addu $23,$19,$24 # 27
+ srl $24,$2,6
+ xor $10,$3,$7
+ sll $9,$2,7
+ and $10,$2
+ srl $8,$2,11
+ xor $24,$9
+ sll $9,$2,21
+ xor $24,$8
+ srl $8,$2,25
+ xor $24,$9
+ sll $9,$2,26
+ xor $24,$8
+ xor $10,$7 # Ch(e,f,g)
+ xor $8,$9,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $23,$10
+ lw $10,108($6) # K[27]
+ sll $9,$25,10
+ addu $23,$8
+ srl $8,$25,13
+ xor $24,$9
+ sll $9,$25,19
+ xor $24,$8
+ srl $8,$25,22
+ xor $24,$9
+ sll $9,$25,30
+ xor $24,$8
+ sw $19,44($29) # offload to ring buffer
+ xor $24,$9 # Sigma0(a)
+
+ or $8,$25,$30
+ and $9,$25,$30
+ and $8,$31
+ or $9,$8 # Maj(a,b,c)
+ addu $23,$10 # +=K[27]
+ addu $24,$9
+
+ addu $1,$23
+ addu $24,$23
+ lw $22,56($29) # prefetch from ring buffer
+ srl $10,$21,3 # Xupdate(28)
+ addu $20,$13 # +=X[i+9]
+ sll $9,$21,14
+ srl $8,$21,7
+ xor $10,$9
+ sll $9,11
+ xor $10,$8
+ srl $8,$21,18
+ xor $10,$9
+
+ srl $11,$18,10
+ xor $10,$8 # sigma0(X[i+1])
+ sll $9,$18,13
+ addu $20,$10
+ srl $8,$18,17
+ xor $11,$9
+ sll $9,2
+ xor $11,$8
+ srl $8,$18,19
+ xor $11,$9
+
+ xor $11,$8 # sigma1(X[i+14])
+ addu $20,$11
+ addu $8,$20,$7 # 28
+ srl $7,$1,6
+ xor $11,$2,$3
+ sll $10,$1,7
+ and $11,$1
+ srl $9,$1,11
+ xor $7,$10
+ sll $10,$1,21
+ xor $7,$9
+ srl $9,$1,25
+ xor $7,$10
+ sll $10,$1,26
+ xor $7,$9
+ xor $11,$3 # Ch(e,f,g)
+ xor $9,$10,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $8,$11
+ lw $11,112($6) # K[28]
+ sll $10,$24,10
+ addu $8,$9
+ srl $9,$24,13
+ xor $7,$10
+ sll $10,$24,19
+ xor $7,$9
+ srl $9,$24,22
+ xor $7,$10
+ sll $10,$24,30
+ xor $7,$9
+ sw $20,48($29) # offload to ring buffer
+ xor $7,$10 # Sigma0(a)
+
+ or $9,$24,$25
+ and $10,$24,$25
+ and $9,$30
+ or $10,$9 # Maj(a,b,c)
+ addu $8,$11 # +=K[28]
+ addu $7,$10
+
+ addu $31,$8
+ addu $7,$8
+ lw $23,60($29) # prefetch from ring buffer
+ srl $11,$22,3 # Xupdate(29)
+ addu $21,$14 # +=X[i+9]
+ sll $10,$22,14
+ srl $9,$22,7
+ xor $11,$10
+ sll $10,11
+ xor $11,$9
+ srl $9,$22,18
+ xor $11,$10
+
+ srl $12,$19,10
+ xor $11,$9 # sigma0(X[i+1])
+ sll $10,$19,13
+ addu $21,$11
+ srl $9,$19,17
+ xor $12,$10
+ sll $10,2
+ xor $12,$9
+ srl $9,$19,19
+ xor $12,$10
+
+ xor $12,$9 # sigma1(X[i+14])
+ addu $21,$12
+ addu $9,$21,$3 # 29
+ srl $3,$31,6
+ xor $12,$1,$2
+ sll $11,$31,7
+ and $12,$31
+ srl $10,$31,11
+ xor $3,$11
+ sll $11,$31,21
+ xor $3,$10
+ srl $10,$31,25
+ xor $3,$11
+ sll $11,$31,26
+ xor $3,$10
+ xor $12,$2 # Ch(e,f,g)
+ xor $10,$11,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $9,$12
+ lw $12,116($6) # K[29]
+ sll $11,$7,10
+ addu $9,$10
+ srl $10,$7,13
+ xor $3,$11
+ sll $11,$7,19
+ xor $3,$10
+ srl $10,$7,22
+ xor $3,$11
+ sll $11,$7,30
+ xor $3,$10
+ sw $21,52($29) # offload to ring buffer
+ xor $3,$11 # Sigma0(a)
+
+ or $10,$7,$24
+ and $11,$7,$24
+ and $10,$25
+ or $11,$10 # Maj(a,b,c)
+ addu $9,$12 # +=K[29]
+ addu $3,$11
+
+ addu $30,$9
+ addu $3,$9
+ lw $8,0($29) # prefetch from ring buffer
+ srl $12,$23,3 # Xupdate(30)
+ addu $22,$15 # +=X[i+9]
+ sll $11,$23,14
+ srl $10,$23,7
+ xor $12,$11
+ sll $11,11
+ xor $12,$10
+ srl $10,$23,18
+ xor $12,$11
+
+ srl $13,$20,10
+ xor $12,$10 # sigma0(X[i+1])
+ sll $11,$20,13
+ addu $22,$12
+ srl $10,$20,17
+ xor $13,$11
+ sll $11,2
+ xor $13,$10
+ srl $10,$20,19
+ xor $13,$11
+
+ xor $13,$10 # sigma1(X[i+14])
+ addu $22,$13
+ addu $10,$22,$2 # 30
+ srl $2,$30,6
+ xor $13,$31,$1
+ sll $12,$30,7
+ and $13,$30
+ srl $11,$30,11
+ xor $2,$12
+ sll $12,$30,21
+ xor $2,$11
+ srl $11,$30,25
+ xor $2,$12
+ sll $12,$30,26
+ xor $2,$11
+ xor $13,$1 # Ch(e,f,g)
+ xor $11,$12,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $10,$13
+ lw $13,120($6) # K[30]
+ sll $12,$3,10
+ addu $10,$11
+ srl $11,$3,13
+ xor $2,$12
+ sll $12,$3,19
+ xor $2,$11
+ srl $11,$3,22
+ xor $2,$12
+ sll $12,$3,30
+ xor $2,$11
+ sw $22,56($29) # offload to ring buffer
+ xor $2,$12 # Sigma0(a)
+
+ or $11,$3,$7
+ and $12,$3,$7
+ and $11,$24
+ or $12,$11 # Maj(a,b,c)
+ addu $10,$13 # +=K[30]
+ addu $2,$12
+
+ addu $25,$10
+ addu $2,$10
+ lw $9,4($29) # prefetch from ring buffer
+ srl $13,$8,3 # Xupdate(31)
+ addu $23,$16 # +=X[i+9]
+ sll $12,$8,14
+ srl $11,$8,7
+ xor $13,$12
+ sll $12,11
+ xor $13,$11
+ srl $11,$8,18
+ xor $13,$12
+
+ srl $14,$21,10
+ xor $13,$11 # sigma0(X[i+1])
+ sll $12,$21,13
+ addu $23,$13
+ srl $11,$21,17
+ xor $14,$12
+ sll $12,2
+ xor $14,$11
+ srl $11,$21,19
+ xor $14,$12
+
+ xor $14,$11 # sigma1(X[i+14])
+ addu $23,$14
+ addu $11,$23,$1 # 31
+ srl $1,$25,6
+ xor $14,$30,$31
+ sll $13,$25,7
+ and $14,$25
+ srl $12,$25,11
+ xor $1,$13
+ sll $13,$25,21
+ xor $1,$12
+ srl $12,$25,25
+ xor $1,$13
+ sll $13,$25,26
+ xor $1,$12
+ xor $14,$31 # Ch(e,f,g)
+ xor $12,$13,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $11,$14
+ lw $14,124($6) # K[31]
+ sll $13,$2,10
+ addu $11,$12
+ srl $12,$2,13
+ xor $1,$13
+ sll $13,$2,19
+ xor $1,$12
+ srl $12,$2,22
+ xor $1,$13
+ sll $13,$2,30
+ xor $1,$12
+ sw $23,60($29) # offload to ring buffer
+ xor $1,$13 # Sigma0(a)
+
+ or $12,$2,$3
+ and $13,$2,$3
+ and $12,$7
+ or $13,$12 # Maj(a,b,c)
+ addu $11,$14 # +=K[31]
+ addu $1,$13
+
+ addu $24,$11
+ addu $1,$11
+ lw $10,8($29) # prefetch from ring buffer
+ and $14,0xfff
+ li $15,2290
+ .set noreorder
+ bne $14,$15,.L16_xx
+ add $6,16*4 # Ktbl+=16
+
+ lw $23,16*4($29) # restore pointer to the end of input
+ lw $8,0*4($4)
+ lw $9,1*4($4)
+ lw $10,2*4($4)
+ add $5,16*4
+ lw $11,3*4($4)
+ addu $1,$8
+ lw $12,4*4($4)
+ addu $2,$9
+ lw $13,5*4($4)
+ addu $3,$10
+ lw $14,6*4($4)
+ addu $7,$11
+ lw $15,7*4($4)
+ addu $24,$12
+ sw $1,0*4($4)
+ addu $25,$13
+ sw $2,1*4($4)
+ addu $30,$14
+ sw $3,2*4($4)
+ addu $31,$15
+ sw $7,3*4($4)
+ sw $24,4*4($4)
+ sw $25,5*4($4)
+ sw $30,6*4($4)
+ sw $31,7*4($4)
+
+ bne $5,$23,.Loop
+ sub $6,192 # rewind $6
+
+ lw $31,128-1*4($29)
+ lw $30,128-2*4($29)
+ lw $23,128-3*4($29)
+ lw $22,128-4*4($29)
+ lw $21,128-5*4($29)
+ lw $20,128-6*4($29)
+ lw $19,128-7*4($29)
+ lw $18,128-8*4($29)
+ lw $17,128-9*4($29)
+ lw $16,128-10*4($29)
+ jr $31
+ add $29,128
+.end sha256_block_data_order
+
+.rdata
+.align 5
+K256:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.asciiz "SHA256 for MIPS, CRYPTOGAMS by <appro@openssl.org>"
+.align 5
+
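
In the MIPS code above, rounds 0 through 15 byte-swap each input word (the "byte swap(n)" comments; SHA-256 input words are big-endian, so the swap suggests this output targets little-endian MIPS) and park it in a 16-word ring buffer on the stack ("offload to ring buffer"). The .L16_xx loop then derives words 16..63 in place via the "Xupdate(i)" steps, which implement the FIPS 180-4 schedule recurrence with sigma0 = ror7 ^ ror18 ^ shr3 and sigma1 = ror17 ^ ror19 ^ shr10; the asm builds each rotate from a matching srl/sll pair. A minimal C sketch of one such step under the same ring-buffer convention follows; xupdate and ror32 are hypothetical helpers, not part of the generated code.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
        return (x >> n) | (x << (32 - n));
}

/* Schedule update for round i (i >= 16) over a 16-word ring buffer:
 * X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]) */
static uint32_t xupdate(uint32_t X[16], unsigned i)
{
        uint32_t x1  = X[(i + 1) & 15];
        uint32_t x14 = X[(i + 14) & 15];

        uint32_t s0 = ror32(x1, 7) ^ ror32(x1, 18) ^ (x1 >> 3);      /* sigma0(X[i+1])  */
        uint32_t s1 = ror32(x14, 17) ^ ror32(x14, 19) ^ (x14 >> 10); /* sigma1(X[i+14]) */

        X[i & 15] += s0 + X[(i + 9) & 15] + s1;   /* "+=X[i+9]" in the asm */
        return X[i & 15];
}

The loop's exit test is a small trick: instead of keeping a counter, the sequence "and $14,0xfff" / "li $15,2290" / "bne $14,$15,.L16_xx" recognizes the final round constant 0xc67178f2 by the low 12 bits (0x8f2 == 2290) of the K[] word just loaded, and only then falls through to the feed-forward additions into the hash context.
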
diff --git a/main/openssl/crypto/sha/asm/sha256-x86_64.S b/main/openssl/crypto/sha/asm/sha256-x86_64.S
new file mode 100644
index 00000000..db5b898f
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-x86_64.S
@@ -0,0 +1,1778 @@
+.text
+
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+.Lprologue:
+
+ leaq K256(%rip),%rbp
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xorq %rdi,%rdi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,0(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,4(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,8(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,12(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,16(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,20(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,24(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,28(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,32(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,36(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,40(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,44(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,48(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,52(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,56(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,60(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 36(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r14d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,0(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 40(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %r14d,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,4(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 44(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r14d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,8(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 48(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %r14d,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,12(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 52(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r14d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,16(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 56(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %r14d,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,20(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 60(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r14d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,24(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 0(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %r14d,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,28(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 4(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r14d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,32(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 8(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %r14d,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,36(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 12(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r14d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,40(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 16(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %r14d,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,44(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 20(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r14d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,48(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 24(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %r14d,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,52(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 28(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r14d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,56(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 32(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %r14d,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,60(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ cmpq $64,%rdi
+ jb .Lrounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha256_block_data_order,.-sha256_block_data_order
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
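
The unrolled groups above each perform one SHA-256 round: the rorl/xorl ladder in %r13d accumulates Sigma1(e), the one in %r14d accumulates Sigma0(a), %r15d carries the Ch/Maj temporaries, and K256 holds the round constants (the first 32 bits of the fractional parts of the cube roots of the first 64 primes). A minimal C restatement of the round for reference only; rotr and sha256_round are illustrative names, not part of the patched sources:

#include <stdint.h>

static uint32_t rotr(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round: T1 is the value the asm accumulates in %r12d and
 * adds both to the d-slot and to the new a (the paired "addl %r12d"
 * instructions in each group); the generated code renames registers
 * every round instead of shuffling the state array as done here. */
static void sha256_round(uint32_t s[8], uint32_t Wi, uint32_t Ki)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25); /* %r13d ladder */
    uint32_t ch  = (e & f) ^ (~e & g);                     /* %r15d        */
    uint32_t T1  = h + S1 + ch + Ki + Wi;
    uint32_t S0  = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22); /* %r14d ladder */
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    uint32_t T2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}
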
diff --git a/main/openssl/crypto/sha/asm/sha512-586.S b/main/openssl/crypto/sha/asm/sha512-586.S
new file mode 100644
index 00000000..4b806f35
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-586.S
@@ -0,0 +1,563 @@
+.file "sha512-586.s"
+.text
+.globl sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K512-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $7,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+.align 16
+.L002loop_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 64(%edi),%eax
+ movl 68(%edi),%ebx
+ movl 72(%edi),%ecx
+ movl 76(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 80(%edi),%eax
+ movl 84(%edi),%ebx
+ movl 88(%edi),%ecx
+ movl 92(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 96(%edi),%eax
+ movl 100(%edi),%ebx
+ movl 104(%edi),%ecx
+ movl 108(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 112(%edi),%eax
+ movl 116(%edi),%ebx
+ movl 120(%edi),%ecx
+ movl 124(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $128,%edi
+ subl $72,%esp
+ movl %edi,204(%esp)
+ leal 8(%esp),%edi
+ movl $16,%ecx
+.long 2784229001
+.align 16
+.L00300_15_x86:
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $148,%dl
+ jne .L00300_15_x86
+.align 16
+.L00416_79_x86:
+ movl 312(%esp),%ecx
+ movl 316(%esp),%edx
+ movl %ecx,%esi
+ shrl $1,%ecx
+ movl %edx,%edi
+ shrl $1,%edx
+ movl %ecx,%eax
+ shll $24,%esi
+ movl %edx,%ebx
+ shll $24,%edi
+ xorl %esi,%ebx
+ shrl $6,%ecx
+ xorl %edi,%eax
+ shrl $6,%edx
+ xorl %ecx,%eax
+ shll $7,%esi
+ xorl %edx,%ebx
+ shll $1,%edi
+ xorl %esi,%ebx
+ shrl $1,%ecx
+ xorl %edi,%eax
+ shrl $1,%edx
+ xorl %ecx,%eax
+ shll $6,%edi
+ xorl %edx,%ebx
+ xorl %edi,%eax
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl 208(%esp),%ecx
+ movl 212(%esp),%edx
+ movl %ecx,%esi
+ shrl $6,%ecx
+ movl %edx,%edi
+ shrl $6,%edx
+ movl %ecx,%eax
+ shll $3,%esi
+ movl %edx,%ebx
+ shll $3,%edi
+ xorl %esi,%eax
+ shrl $13,%ecx
+ xorl %edi,%ebx
+ shrl $13,%edx
+ xorl %ecx,%eax
+ shll $10,%esi
+ xorl %edx,%ebx
+ shll $10,%edi
+ xorl %esi,%ebx
+ shrl $10,%ecx
+ xorl %edi,%eax
+ shrl $10,%edx
+ xorl %ecx,%ebx
+ shll $13,%edi
+ xorl %edx,%eax
+ xorl %edi,%eax
+ movl 320(%esp),%ecx
+ movl 324(%esp),%edx
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ movl 248(%esp),%esi
+ movl 252(%esp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,192(%esp)
+ movl %ebx,196(%esp)
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $23,%dl
+ jne .L00416_79_x86
+ movl 840(%esp),%esi
+ movl 844(%esp),%edi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ addl 8(%esp),%eax
+ adcl 12(%esp),%ebx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ addl 16(%esp),%ecx
+ adcl 20(%esp),%edx
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ addl 24(%esp),%eax
+ adcl 28(%esp),%ebx
+ movl %eax,16(%esi)
+ movl %ebx,20(%esi)
+ addl 32(%esp),%ecx
+ adcl 36(%esp),%edx
+ movl %ecx,24(%esi)
+ movl %edx,28(%esi)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ addl 40(%esp),%eax
+ adcl 44(%esp),%ebx
+ movl %eax,32(%esi)
+ movl %ebx,36(%esi)
+ addl 48(%esp),%ecx
+ adcl 52(%esp),%edx
+ movl %ecx,40(%esi)
+ movl %edx,44(%esi)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ addl 56(%esp),%eax
+ adcl 60(%esp),%ebx
+ movl %eax,48(%esi)
+ movl %ebx,52(%esi)
+ addl 64(%esp),%ecx
+ adcl 68(%esp),%edx
+ movl %ecx,56(%esi)
+ movl %edx,60(%esi)
+ addl $840,%esp
+ subl $640,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop_x86
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K512:
+.long 3609767458,1116352408
+.long 602891725,1899447441
+.long 3964484399,3049323471
+.long 2173295548,3921009573
+.long 4081628472,961987163
+.long 3053834265,1508970993
+.long 2937671579,2453635748
+.long 3664609560,2870763221
+.long 2734883394,3624381080
+.long 1164996542,310598401
+.long 1323610764,607225278
+.long 3590304994,1426881987
+.long 4068182383,1925078388
+.long 991336113,2162078206
+.long 633803317,2614888103
+.long 3479774868,3248222580
+.long 2666613458,3835390401
+.long 944711139,4022224774
+.long 2341262773,264347078
+.long 2007800933,604807628
+.long 1495990901,770255983
+.long 1856431235,1249150122
+.long 3175218132,1555081692
+.long 2198950837,1996064986
+.long 3999719339,2554220882
+.long 766784016,2821834349
+.long 2566594879,2952996808
+.long 3203337956,3210313671
+.long 1034457026,3336571891
+.long 2466948901,3584528711
+.long 3758326383,113926993
+.long 168717936,338241895
+.long 1188179964,666307205
+.long 1546045734,773529912
+.long 1522805485,1294757372
+.long 2643833823,1396182291
+.long 2343527390,1695183700
+.long 1014477480,1986661051
+.long 1206759142,2177026350
+.long 344077627,2456956037
+.long 1290863460,2730485921
+.long 3158454273,2820302411
+.long 3505952657,3259730800
+.long 106217008,3345764771
+.long 3606008344,3516065817
+.long 1432725776,3600352804
+.long 1467031594,4094571909
+.long 851169720,275423344
+.long 3100823752,430227734
+.long 1363258195,506948616
+.long 3750685593,659060556
+.long 3785050280,883997877
+.long 3318307427,958139571
+.long 3812723403,1322822218
+.long 2003034995,1537002063
+.long 3602036899,1747873779
+.long 1575990012,1955562222
+.long 1125592928,2024104815
+.long 2716904306,2227730452
+.long 442776044,2361852424
+.long 593698344,2428436474
+.long 3733110249,2756734187
+.long 2999351573,3204031479
+.long 3815920427,3329325298
+.long 3928383900,3391569614
+.long 566280711,3515267271
+.long 3454069534,3940187606
+.long 4000239992,4118630271
+.long 1914138554,116418474
+.long 2731055270,174292421
+.long 3203993006,289380356
+.long 320620315,460393269
+.long 587496836,685471733
+.long 1086792851,852142971
+.long 365543100,1017036298
+.long 2618297676,1126000580
+.long 3409855158,1288033470
+.long 4234509866,1501505948
+.long 987167468,1607167915
+.long 1246189591,1816402316
+.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
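
sha512-586.S runs SHA-512 on 32-bit x86, so every 64-bit quantity lives in a (lo,hi) register pair and each Sigma rotation becomes one of the shrl/shll/xorl ladders seen in .L00300_15_x86 and .L00416_79_x86 above. A minimal C model of that pairing; u64pair, rotr64 and Sigma1 are illustrative names, not code from the patch:

#include <stdint.h>

struct u64pair { uint32_t lo, hi; };

/* ROTR on a 64-bit value held as two 32-bit halves, for 0 < n < 32:
 * each output half takes bits from both input halves, which is exactly
 * the paired shift-and-xor pattern in the listing above. */
static struct u64pair rotr64(struct u64pair x, unsigned n)
{
    struct u64pair r;
    r.lo = (x.lo >> n) | (x.hi << (32 - n));
    r.hi = (x.hi >> n) | (x.lo << (32 - n));
    return r;
}

/* Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41); a rotation of 32
 * or more swaps the halves first, so ROTR 41 becomes swap + ROTR 9. */
static struct u64pair Sigma1(struct u64pair e)
{
    struct u64pair s = { e.hi, e.lo };   /* half swap = ROTR by 32 */
    struct u64pair a = rotr64(e, 14), b = rotr64(e, 18), c = rotr64(s, 9);
    struct u64pair r = { a.lo ^ b.lo ^ c.lo, a.hi ^ b.hi ^ c.hi };
    return r;
}
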
diff --git a/main/openssl/crypto/sha/asm/sha512-586.pl b/main/openssl/crypto/sha/asm/sha512-586.pl
index 5b9f3337..9f8c51eb 100644
--- a/main/openssl/crypto/sha/asm/sha512-586.pl
+++ b/main/openssl/crypto/sha/asm/sha512-586.pl
@@ -23,7 +23,7 @@
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
-# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
+# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
@@ -142,9 +142,9 @@ sub BODY_00_15_x86 {
&mov ("edx",$Ehi);
&mov ("esi","ecx");
- &shr ("ecx",9) # lo>>9
+ &shr ("ecx",9); # lo>>9
&mov ("edi","edx");
- &shr ("edx",9) # hi>>9
+ &shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
@@ -207,9 +207,9 @@ sub BODY_00_15_x86 {
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
- &shr ("ecx",2) # lo>>2
+ &shr ("ecx",2); # lo>>2
&mov ("edi","edx");
- &shr ("edx",2) # hi>>2
+ &shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
@@ -452,9 +452,9 @@ if ($sse2) {
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
- &shr ("ecx",1) # lo>>1
+ &shr ("ecx",1); # lo>>1
&mov ("edi","edx");
- &shr ("edx",1) # hi>>1
+ &shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
@@ -488,9 +488,9 @@ if ($sse2) {
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
- &shr ("ecx",6) # lo>>6
+ &shr ("ecx",6); # lo>>6
&mov ("edi","edx");
- &shr ("edx",6) # hi>>6
+ &shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.pl b/main/openssl/crypto/sha/asm/sha512-armv4.pl
index 3a35861a..7faf37b1 100644
--- a/main/openssl/crypto/sha/asm/sha512-armv4.pl
+++ b/main/openssl/crypto/sha/asm/sha512-armv4.pl
@@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 25.5 cycles or 47% faster than integer-only code.
+
# Byte order [in]dependence. =========================================
#
-# Caller is expected to maintain specific *dword* order in h[0-7],
-# namely with most significant dword at *lower* address, which is
-# reflected in below two parameters. *Byte* order within these dwords
-# in turn is whatever *native* byte order on current platform.
-$hi=0;
-$lo=4;
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-$ctx="r0";
+$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
+
$Tlo="r3";
$Thi="r4";
$Alo="r5";
@@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
- ldr $t2,[sp,#$Hoff+0] @ h.lo
- ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
+ str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
+ str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
+ ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
+ ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
@@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
- ldr $t2,[$Ktbl,#4] @ K[i].lo
+ ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
- ldr $t3,[$Ktbl,#0] @ K[i].hi
+ ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
+ and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
+ ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
-
- and $t0,$t2,#0xff
teq $t0,#$magic
- orreq $Ktbl,$Ktbl,#1
- ldr $t2,[sp,#$Boff+0] @ b.lo
ldr $t3,[sp,#$Coff+0] @ c.lo
+ orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
+ and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
- and $t0,$Alo,$t2
- orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi
+ orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
- orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
+ orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
- orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo
- adc $Ahi,$Ahi,$Thi @ h += T
-
+ orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
+ adc $Ahi,$Ahi,$Thi @ h += T
+ tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
.text
.code 32
.type K512,%object
.align 5
K512:
-.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
- sub $Ktbl,r3,#640 @ K512
+ sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4]
.L00_15:
+#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
@@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
- str $Tlo,[sp,#$Xoff+0]
- str $Thi,[sp,#$Xoff+4]
+#else
+ ldr $Tlo,[$inp,#4]
+ ldr $Thi,[$inp],#8
+#ifdef __ARMEL__
+ rev $Tlo,$Tlo
+ rev $Thi,$Thi
+#endif
+#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
- bic $Ktbl,$Ktbl,#1
-
-.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
- ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
- ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
-
+ bic $Ktbl,$Ktbl,#1
+.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
+ ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
+ ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
@@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
+ ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
- ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
- ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
- str $Tlo,[sp,#$Xoff+0]
- str $Thi,[sp,#$Xoff+4]
___
&BODY_00_15(0x17);
$code.=<<___;
- tst $Ktbl,#1
+ ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
@@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
- adc $Ehi,$Ehi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo]
+ adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
@@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop
add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size sha512_block_data_order,.-sha512_block_data_order
-.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+#endif
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
+
+$code.=<<___ if ($i<16 || $i&1);
+ vshr.u64 $t0,$e,#@Sigma1[0] @ $i
+#if $i<16
+ vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
+#endif
+ vshr.u64 $t1,$e,#@Sigma1[1]
+ vshr.u64 $t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+ vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
+ vsli.64 $t0,$e,#`64-@Sigma1[0]`
+ vsli.64 $t1,$e,#`64-@Sigma1[1]`
+ vsli.64 $t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+ vrev64.8 @X[$i],@X[$i]
+#endif
+ vadd.i64 $T1,$K,$h
+ veor $Ch,$f,$g
+ veor $t0,$t1
+ vand $Ch,$e
+ veor $t0,$t2 @ Sigma1(e)
+ veor $Ch,$g @ Ch(e,f,g)
+ vadd.i64 $T1,$t0
+ vshr.u64 $t0,$a,#@Sigma0[0]
+ vadd.i64 $T1,$Ch
+ vshr.u64 $t1,$a,#@Sigma0[1]
+ vshr.u64 $t2,$a,#@Sigma0[2]
+ vsli.64 $t0,$a,#`64-@Sigma0[0]`
+ vsli.64 $t1,$a,#`64-@Sigma0[1]`
+ vsli.64 $t2,$a,#`64-@Sigma0[2]`
+ vadd.i64 $T1,@X[$i%16]
+ vorr $Maj,$a,$c
+ vand $Ch,$a,$c
+ veor $h,$t0,$t1
+ vand $Maj,$b
+ veor $h,$t2 @ Sigma0(a)
+ vorr $Maj,$Ch @ Maj(a,b,c)
+ vadd.i64 $h,$T1
+ vadd.i64 $d,$T1
+ vadd.i64 $h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1) { &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7)); # view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
+my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
+my $e=@_[4]; # $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+ vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
+ vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
+ vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
+ vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
+ vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
+ vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
+ veor $s1,$t0
+ vshr.u64 $t0,$s0,#@sigma0[0]
+ veor $s1,$t1 @ sigma1(X[i+14])
+ vshr.u64 $t1,$s0,#@sigma0[1]
+ vadd.i64 @X[$i%8],$s1
+ vshr.u64 $s1,$s0,#@sigma0[2]
+ vsli.64 $t0,$s0,#`64-@sigma0[0]`
+ vsli.64 $t1,$s0,#`64-@sigma0[1]`
+ vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
+ veor $s1,$t0
+ vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s0
+ vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
+ veor $s1,$t1 @ sigma0(X[i+1])
+ vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s1
+___
+ &NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.align 4
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ sub $Ktbl,r3,#672 @ K512
+ vldmia $ctx,{$A-$H} @ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov $cnt,#4
+.L16_79_neon:
+ subs $cnt,#1
+___
+for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bne .L16_79_neon
+
+ vldmia $ctx,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia $ctx,{$A-$H} @ save context
+ teq $inp,$len
+ sub $Ktbl,#640 @ rewind K512
+ bne .Loop_neon
+
+ vldmia sp!,{d8-d15} @ epilogue
+ bx lr
+#endif
+___
+}
+$code.=<<___;
+.size sha512_block_data_order,.-sha512_block_data_order
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
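
The new NEON path keeps the eight hash words in d16-d23 and computes the message schedule two rounds per trip through NEON_16_79, interleaving the start of the next round's Sigma1 (the "@ from NEON_00_15" lines), while the WORD64 macro makes K512 byte-order neutral by swapping dword order on little-endian builds. The scalar recurrence the vector code implements, as a reference sketch only; rotr, sigma0, sigma1 and expand are illustrative names:

#include <stdint.h>

static uint64_t rotr(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static uint64_t sigma0(uint64_t x) { return rotr(x, 1) ^ rotr(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr(x, 19) ^ rotr(x, 61) ^ (x >> 6); }

/* W[0..15] come straight from the input block; the NEON code produces
 * two of the remaining 64 entries per iteration of .L16_79_neon. */
static void expand(uint64_t W[80])
{
    int i;
    for (i = 16; i < 80; i++)
        W[i] = W[i - 16] + sigma0(W[i - 15]) + W[i - 7] + sigma1(W[i - 2]);
}
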
diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.s b/main/openssl/crypto/sha/asm/sha512-armv4.s
index b030c16c..57301922 100644
--- a/main/openssl/crypto/sha/asm/sha512-armv4.s
+++ b/main/openssl/crypto/sha/asm/sha512-armv4.s
@@ -1,90 +1,111 @@
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
.text
.code 32
.type K512,%object
.align 5
K512:
-.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add r2,r1,r2,lsl#7 @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
- sub r14,r3,#640 @ K512
+ sub r14,r3,#672 @ K512
sub sp,sp,#9*8
- ldr r7,[r0,#32+4]
- ldr r8,[r0,#32+0]
- ldr r9, [r0,#48+4]
- ldr r10, [r0,#48+0]
- ldr r11, [r0,#56+4]
- ldr r12, [r0,#56+0]
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
.Loop:
str r9, [sp,#48+0]
str r10, [sp,#48+4]
str r11, [sp,#56+0]
str r12, [sp,#56+4]
- ldr r5,[r0,#0+4]
- ldr r6,[r0,#0+0]
- ldr r3,[r0,#8+4]
- ldr r4,[r0,#8+0]
- ldr r9, [r0,#16+4]
- ldr r10, [r0,#16+0]
- ldr r11, [r0,#24+4]
- ldr r12, [r0,#24+0]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
str r3,[sp,#8+0]
str r4,[sp,#8+4]
str r9, [sp,#16+0]
str r10, [sp,#16+4]
str r11, [sp,#24+0]
str r12, [sp,#24+4]
- ldr r3,[r0,#40+4]
- ldr r4,[r0,#40+0]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
str r3,[sp,#40+0]
str r4,[sp,#40+4]
.L00_15:
+#if __ARM_ARCH__<7
ldrb r3,[r1,#7]
ldrb r9, [r1,#6]
ldrb r10, [r1,#5]
@@ -99,17 +120,25 @@ sha512_block_data_order:
orr r4,r4,r12,lsl#8
orr r4,r4,r9,lsl#16
orr r4,r4,r10,lsl#24
- str r3,[sp,#64+0]
- str r4,[sp,#64+4]
- ldr r11,[sp,#56+0] @ h.lo
- ldr r12,[sp,#56+4] @ h.hi
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
eor r9,r9,r7,lsr#18
eor r10,r10,r8,lsr#18
eor r9,r9,r8,lsl#14
@@ -136,25 +165,24 @@ sha512_block_data_order:
and r10,r10,r8
str r6,[sp,#0+4]
eor r9,r9,r11
- ldr r11,[r14,#4] @ K[i].lo
+ ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#0] @ K[i].hi
+ ldr r12,[r14,#HI] @ K[i].hi
adds r3,r3,r9
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adds r3,r3,r11
+ and r9,r11,#0xff
adc r4,r4,r12 @ T += K[i]
adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
-
- and r9,r11,#0xff
teq r9,#148
- orreq r14,r14,#1
- ldr r11,[sp,#8+0] @ b.lo
ldr r12,[sp,#16+0] @ c.lo
+ orreq r14,r14,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -171,38 +199,36 @@ sha512_block_data_order:
eor r9,r9,r5,lsl#25
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adds r3,r3,r9
+ and r9,r5,r11
adc r4,r4,r10 @ T += Sigma0(a)
- and r9,r5,r11
- orr r5,r5,r11
ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
ldr r11,[sp,#16+4] @ c.hi
and r5,r5,r12
- orr r5,r5,r9 @ Maj(a,b,c).lo
and r12,r6,r10
orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
and r6,r6,r11
- orr r6,r6,r12 @ Maj(a,b,c).hi
adds r5,r5,r3
- adc r6,r6,r4 @ h += T
-
+ orr r6,r6,r12 @ Maj(a,b,c).hi
sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
add r14,r14,#8
tst r14,#1
beq .L00_15
- bic r14,r14,#1
-
-.L16_79:
ldr r9,[sp,#184+0]
ldr r10,[sp,#184+4]
- ldr r11,[sp,#80+0]
- ldr r12,[sp,#80+4]
-
+ bic r14,r14,#1
+.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
eor r3,r3,r10,lsl#31
eor r4,r4,r9,lsl#31
eor r3,r3,r9,lsr#8
@@ -226,30 +252,30 @@ sha512_block_data_order:
eor r10,r10,r12,lsl#3
eor r9,r9,r11,lsr#6
eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
eor r9,r9,r12,lsl#26
- ldr r11,[sp,#120+0]
ldr r12,[sp,#120+4]
adds r3,r3,r9
+ ldr r9,[sp,#192+0]
adc r4,r4,r10
- ldr r9,[sp,#192+0]
ldr r10,[sp,#192+4]
adds r3,r3,r11
adc r4,r4,r12
adds r3,r3,r9
adc r4,r4,r10
- str r3,[sp,#64+0]
- str r4,[sp,#64+4]
- ldr r11,[sp,#56+0] @ h.lo
- ldr r12,[sp,#56+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
eor r9,r9,r7,lsr#18
eor r10,r10,r8,lsr#18
eor r9,r9,r8,lsl#14
@@ -276,25 +302,24 @@ sha512_block_data_order:
and r10,r10,r8
str r6,[sp,#0+4]
eor r9,r9,r11
- ldr r11,[r14,#4] @ K[i].lo
+ ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#0] @ K[i].hi
+ ldr r12,[r14,#HI] @ K[i].hi
adds r3,r3,r9
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adds r3,r3,r11
+ and r9,r11,#0xff
adc r4,r4,r12 @ T += K[i]
adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
-
- and r9,r11,#0xff
teq r9,#23
- orreq r14,r14,#1
- ldr r11,[sp,#8+0] @ b.lo
ldr r12,[sp,#16+0] @ c.lo
+ orreq r14,r14,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -311,90 +336,91 @@ sha512_block_data_order:
eor r9,r9,r5,lsl#25
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adds r3,r3,r9
+ and r9,r5,r11
adc r4,r4,r10 @ T += Sigma0(a)
- and r9,r5,r11
- orr r5,r5,r11
ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
ldr r11,[sp,#16+4] @ c.hi
and r5,r5,r12
- orr r5,r5,r9 @ Maj(a,b,c).lo
and r12,r6,r10
orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
and r6,r6,r11
- orr r6,r6,r12 @ Maj(a,b,c).hi
adds r5,r5,r3
- adc r6,r6,r4 @ h += T
-
+ orr r6,r6,r12 @ Maj(a,b,c).hi
sub sp,sp,#8
- add r14,r14,#8
+ adc r6,r6,r4 @ h += T
tst r14,#1
+ add r14,r14,#8
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
beq .L16_79
bic r14,r14,#1
ldr r3,[sp,#8+0]
ldr r4,[sp,#8+4]
- ldr r9, [r0,#0+4]
- ldr r10, [r0,#0+0]
- ldr r11, [r0,#8+4]
- ldr r12, [r0,#8+0]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
adds r9,r5,r9
+ str r9, [r0,#0+LO]
adc r10,r6,r10
+ str r10, [r0,#0+HI]
adds r11,r3,r11
+ str r11, [r0,#8+LO]
adc r12,r4,r12
- str r9, [r0,#0+4]
- str r10, [r0,#0+0]
- str r11, [r0,#8+4]
- str r12, [r0,#8+0]
+ str r12, [r0,#8+HI]
ldr r5,[sp,#16+0]
ldr r6,[sp,#16+4]
ldr r3,[sp,#24+0]
ldr r4,[sp,#24+4]
- ldr r9, [r0,#16+4]
- ldr r10, [r0,#16+0]
- ldr r11, [r0,#24+4]
- ldr r12, [r0,#24+0]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
adds r9,r5,r9
+ str r9, [r0,#16+LO]
adc r10,r6,r10
+ str r10, [r0,#16+HI]
adds r11,r3,r11
+ str r11, [r0,#24+LO]
adc r12,r4,r12
- str r9, [r0,#16+4]
- str r10, [r0,#16+0]
- str r11, [r0,#24+4]
- str r12, [r0,#24+0]
+ str r12, [r0,#24+HI]
ldr r3,[sp,#40+0]
ldr r4,[sp,#40+4]
- ldr r9, [r0,#32+4]
- ldr r10, [r0,#32+0]
- ldr r11, [r0,#40+4]
- ldr r12, [r0,#40+0]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
adds r7,r7,r9
+ str r7,[r0,#32+LO]
adc r8,r8,r10
+ str r8,[r0,#32+HI]
adds r11,r3,r11
+ str r11, [r0,#40+LO]
adc r12,r4,r12
- str r7,[r0,#32+4]
- str r8,[r0,#32+0]
- str r11, [r0,#40+4]
- str r12, [r0,#40+0]
+ str r12, [r0,#40+HI]
ldr r5,[sp,#48+0]
ldr r6,[sp,#48+4]
ldr r3,[sp,#56+0]
ldr r4,[sp,#56+4]
- ldr r9, [r0,#48+4]
- ldr r10, [r0,#48+0]
- ldr r11, [r0,#56+4]
- ldr r12, [r0,#56+0]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
adds r9,r5,r9
+ str r9, [r0,#48+LO]
adc r10,r6,r10
+ str r10, [r0,#48+HI]
adds r11,r3,r11
+ str r11, [r0,#56+LO]
adc r12,r4,r12
- str r9, [r0,#48+4]
- str r10, [r0,#48+0]
- str r11, [r0,#56+4]
- str r12, [r0,#56+0]
+ str r12, [r0,#56+HI]
add sp,sp,#640
sub r14,r14,#640
@@ -403,10 +429,1355 @@ sha512_block_data_order:
bne .Loop
add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size sha512_block_data_order,.-sha512_block_data_order
-.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+#endif
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.align 4
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ sub r3,r3,#672 @ K512
+ vldmia r0,{d16-d23} @ load context
+.Loop_neon:
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d0
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d1
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d2
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d3
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d4
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d5
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d6
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d7
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d8
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d9
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d10
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d11
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d12
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d13
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d14
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d15
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ mov r12,#4
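+ @ rounds 16..79: four passes through the 16-round body below, with
+ @ the message schedule updated two 64-bit lanes at a time in q regs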
+.L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d0
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d1
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d2
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d3
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d4
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d5
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d6
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d7
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d8
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d9
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d10
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d11
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d12
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d13
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d14
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d15
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ bne .L16_79_neon
+
+ vldmia r0,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16-d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne .Loop_neon
+
+ vldmia sp!,{d8-d15} @ epilogue
+ .word 0xe12fff1e @ "bx lr"
+#endif
+.size sha512_block_data_order,.-sha512_block_data_order
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2
+.comm OPENSSL_armcap_P,4,4
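
The scalar ARMv4 path above never uses a 64-bit register: every SHA-512
quantity lives in a lo/hi pair of 32-bit registers, and the "@ LO" / "@ HI"
comments before each round record how each 64-bit rotation in Sigma1 splits
into 32-bit shift pairs (a rotation by 41 = 32+9 swaps the halves first).
Below is a small self-checking C sketch of that identity; the names are
illustrative, not part of the patch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t rotr64(uint64_t x, unsigned n)
    {
        return x >> n | x << (64 - n);      /* n is 14, 18 or 41 here */
    }

    /* Sigma1 as specified for SHA-512 (FIPS 180-4) */
    static uint64_t Sigma1(uint64_t x)
    {
        return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41);
    }

    /* the same value computed the way the ARMv4 code does it: from the
     * 32-bit halves, using exactly the shift pairs listed in the
     * "@ LO" / "@ HI" comments */
    static uint64_t Sigma1_split(uint32_t hi, uint32_t lo)
    {
        uint32_t rlo = (lo >> 14 ^ hi << 18) ^ (lo >> 18 ^ hi << 14)
                     ^ (hi >> 9 ^ lo << 23);
        uint32_t rhi = (hi >> 14 ^ lo << 18) ^ (hi >> 18 ^ lo << 14)
                     ^ (lo >> 9 ^ hi << 23);
        return (uint64_t)rhi << 32 | rlo;
    }

    int main(void)
    {
        uint64_t e = 0x510e527fade682d1ULL; /* SHA-512 initial e */
        assert(Sigma1(e) == Sigma1_split((uint32_t)(e >> 32), (uint32_t)e));
        printf("split and 64-bit forms agree\n");
        return 0;
    }

Because the two members of each shift pair cover disjoint bit ranges, the
assembly is free to merge them with eor instead of orr and fold the whole
Sigma chain into one run of XORs, which is what the eor sequences after each
"mov r9,r7,lsr#14" do.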
diff --git a/main/openssl/crypto/sha/asm/sha512-mips.pl b/main/openssl/crypto/sha/asm/sha512-mips.pl
new file mode 100644
index 00000000..ffa053bb
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-mips.pl
@@ -0,0 +1,455 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA2 block procedures for MIPS.
+
+# October 2010.
+#
+# SHA256 performance improvement on a MIPS R5000 CPU is ~27% over gcc-
+# generated code in an o32 build and ~55% in an n32/64 build. The
+# SHA512 improvement [SHA512 can for now only be compiled for the
+# MIPS64 ISA] is a more modest ~17%, but it comes for free, because it
+# is the same instruction sequence. Improvement coefficients are for
+# aligned input.
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference, here is the register layout for the N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
+
+if ($output =~ /512/) {
+ $label="512";
+ $SZ=8;
+ $LD="ld"; # load from memory
+ $ST="sd"; # store to memory
+ $SLL="dsll"; # shift left logical
+ $SRL="dsrl"; # shift right logical
+ $ADDU="daddu";
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=( 7, 1, 8); # right shift first
+ @sigma1=( 6,19,61); # right shift first
+ $lastK=0x817;
+ $rounds=80;
+} else {
+ $label="256";
+ $SZ=4;
+ $LD="lw"; # load from memory
+ $ST="sw"; # store to memory
+ $SLL="sll"; # shift left logical
+ $SRL="srl"; # shift right logical
+ $ADDU="addu";
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 3, 7,18); # right shift first
+ @sigma1=(10,17,19); # right shift first
+ $lastK=0x8f2;
+ $rounds=64;
+}
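+# (one script emits either hash: the parameter set above is selected by
+# whether the output file name contains "512")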
+
+$MSB = $big_endian ? 0 : ($SZ-1);
+$LSB = ($SZ-1)&~$MSB;
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
+@X=map("\$$_",(8..23));
+
+$ctx=$a0;
+$inp=$a1;
+$len=$a2; $Ktbl=$len;
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___ if ($i<15);
+ ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
+ ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
+___
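+# the emitted left/right load pair (lwl/lwr, or ldl/ldr for SHA512)
+# performs one possibly unaligned load of the next message word X[i+1]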
+$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
+ srl $tmp0,@X[0],24 # byte swap($i)
+ srl $tmp1,@X[0],8
+ andi $tmp2,@X[0],0xFF00
+ sll @X[0],@X[0],24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or @X[0],$tmp0
+ or $tmp1,$tmp2
+ or @X[0],$tmp1
+___
+$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+ and $tmp1,@X[0],$tmp0 # byte swap($i)
+ dsrl $tmp2,@X[0],24
+ dsll $tmp1,24
+ and $tmp2,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ and $tmp2,@X[0],$tmp0
+ dsrl @X[0],8
+ dsll $tmp2,8
+ and @X[0],$tmp0
+ or $tmp1,$tmp2
+ or @X[0],$tmp1
+ dsrl $tmp1,@X[0],32
+ dsll @X[0],32
+ or @X[0],$tmp1
+___
+$code.=<<___;
+ $ADDU $T1,@X[0],$h # $i
+ $SRL $h,$e,@Sigma1[0]
+ xor $tmp2,$f,$g
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
+ and $tmp2,$e
+ $SRL $tmp0,$e,@Sigma1[1]
+ xor $h,$tmp1
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
+ xor $h,$tmp0
+ $SRL $tmp0,$e,@Sigma1[2]
+ xor $h,$tmp1
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
+ xor $h,$tmp0
+ xor $tmp2,$g # Ch(e,f,g)
+ xor $tmp0,$tmp1,$h # Sigma1(e)
+
+ $SRL $h,$a,@Sigma0[0]
+ $ADDU $T1,$tmp2
+ $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
+ $ADDU $T1,$tmp0
+ $SRL $tmp0,$a,@Sigma0[1]
+ xor $h,$tmp1
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
+ xor $h,$tmp0
+ $SRL $tmp0,$a,@Sigma0[2]
+ xor $h,$tmp1
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
+ xor $h,$tmp0
+ $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
+ xor $h,$tmp1 # Sigma0(a)
+
+ or $tmp0,$a,$b
+ and $tmp1,$a,$b
+ and $tmp0,$c
+ or $tmp1,$tmp0 # Maj(a,b,c)
+ $ADDU $T1,$tmp2 # +=K[$i]
+ $ADDU $h,$tmp1
+
+ $ADDU $d,$T1
+ $ADDU $h,$T1
+___
+$code.=<<___ if ($i>=13);
+ $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
+___
+}
+
+sub BODY_16_XX {
+my $i=@_[0];
+my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___;
+ $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
+ $ADDU @X[0],@X[9] # +=X[i+9]
+ $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
+ $SRL $tmp0,@X[1],@sigma0[1]
+ xor $tmp2,$tmp1
+ $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
+ xor $tmp2,$tmp0
+ $SRL $tmp0,@X[1],@sigma0[2]
+ xor $tmp2,$tmp1
+
+ $SRL $tmp3,@X[14],@sigma1[0]
+ xor $tmp2,$tmp0 # sigma0(X[i+1])
+ $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
+ $ADDU @X[0],$tmp2
+ $SRL $tmp0,@X[14],@sigma1[1]
+ xor $tmp3,$tmp1
+ $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
+ xor $tmp3,$tmp0
+ $SRL $tmp0,@X[14],@sigma1[2]
+ xor $tmp3,$tmp1
+
+ xor $tmp3,$tmp0 # sigma1(X[i+14])
+ $ADDU @X[0],$tmp3
+___
+ &BODY_00_15(@_);
+}
+
+$FRAMESIZE=16*$SZ+16*$SZREG;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code.=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set noat
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+
+.align 5
+.globl sha${label}_block_data_order
+.ent sha${label}_block_data_order
+sha${label}_block_data_order:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Ktbl
+ .cpsetup $pf,$zero,sha${label}_block_data_order
+___
+$code.=<<___;
+ .set reorder
+ la $Ktbl,K${label} # PIC-ified 'load address'
+
+ $LD $A,0*$SZ($ctx) # load context
+ $LD $B,1*$SZ($ctx)
+ $LD $C,2*$SZ($ctx)
+ $LD $D,3*$SZ($ctx)
+ $LD $E,4*$SZ($ctx)
+ $LD $F,5*$SZ($ctx)
+ $LD $G,6*$SZ($ctx)
+ $LD $H,7*$SZ($ctx)
+
+ $PTR_ADD @X[15],$inp # pointer to the end of input
+ $REG_S @X[15],16*$SZ($sp)
+ b .Loop
+
+.align 5
+.Loop:
+ ${LD}l @X[0],$MSB($inp)
+ ${LD}r @X[0],$LSB($inp)
+___
+for ($i=0;$i<16;$i++)
+{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
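+# unshift(@V,pop(@V)) rotates the a..h register names right by one for
+# the next round and push(@X,shift(@X)) slides the 16-word message
+# window, so the generated code never moves working variables between
+# registers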
+$code.=<<___;
+ b .L16_xx
+.align 4
+.L16_xx:
+___
+for (;$i<32;$i++)
+{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+$code.=<<___;
+ and @X[6],0xfff
+ li @X[7],$lastK
+ .set noreorder
+ bne @X[6],@X[7],.L16_xx
+ $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
+
+ $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
+ $LD @X[0],0*$SZ($ctx)
+ $LD @X[1],1*$SZ($ctx)
+ $LD @X[2],2*$SZ($ctx)
+ $PTR_ADD $inp,16*$SZ
+ $LD @X[3],3*$SZ($ctx)
+ $ADDU $A,@X[0]
+ $LD @X[4],4*$SZ($ctx)
+ $ADDU $B,@X[1]
+ $LD @X[5],5*$SZ($ctx)
+ $ADDU $C,@X[2]
+ $LD @X[6],6*$SZ($ctx)
+ $ADDU $D,@X[3]
+ $LD @X[7],7*$SZ($ctx)
+ $ADDU $E,@X[4]
+ $ST $A,0*$SZ($ctx)
+ $ADDU $F,@X[5]
+ $ST $B,1*$SZ($ctx)
+ $ADDU $G,@X[6]
+ $ST $C,2*$SZ($ctx)
+ $ADDU $H,@X[7]
+ $ST $D,3*$SZ($ctx)
+ $ST $E,4*$SZ($ctx)
+ $ST $F,5*$SZ($ctx)
+ $ST $G,6*$SZ($ctx)
+ $ST $H,7*$SZ($ctx)
+
+ bne $inp,@X[15],.Loop
+ $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
+
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end sha${label}_block_data_order
+
+.rdata
+.align 5
+K${label}:
+___
+if ($SZ==4) {
+$code.=<<___;
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+$code.=<<___;
+ .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .dword 0x3956c25bf348b538, 0x59f111f1b605d019
+ .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .dword 0xd807aa98a3030242, 0x12835b0145706fbe
+ .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .dword 0x06ca6351e003826f, 0x142929670a0e6e70
+ .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .dword 0x81c2c92e47edaee6, 0x92722c851482353b
+ .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .dword 0xd192e819d6ef5218, 0xd69906245565a910
+ .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .dword 0x90befffa23631e28, 0xa4506cebde82bde9
+ .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .dword 0xca273eceea26619c, 0xd186b8c721c0c207
+ .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .dword 0x113f9804bef90dae, 0x1b710b35131c471b
+ .dword 0x28db77f523047d84, 0x32caab7b40c72493
+ .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+___
+}
+$code.=<<___;
+.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
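
For little-endian targets (!$big_endian) BODY_00_15 emits a mask-and-shift
byte swap for each 64-bit message word: reverse the bytes inside each 32-bit
half, then exchange the halves. Below is a self-checking C rendition; it
assumes nothing beyond the two mask constants the MIPS code builds with
ori/dsll, and the helper names are hypothetical:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t bswap64_ref(uint64_t x)     /* plain byte reversal */
    {
        uint64_t r = 0;
        for (int i = 0; i < 8; i++)
            r = r << 8 | (x >> (8 * i) & 0xFF);
        return r;
    }

    static uint64_t bswap64_mips_style(uint64_t x)
    {
        const uint64_t m1 = 0x000000FF000000FFULL;  /* built with ori+dsll+or */
        const uint64_t m2 = 0x0000FF000000FF00ULL;  /* m1 shifted left by 8 */
        /* byte-reverse each 32-bit half with masked shifts ... */
        uint64_t t = (x & m1) << 24 | (x >> 24 & m1)
                   | (x & m2) << 8  | (x >> 8 & m2);
        return t >> 32 | t << 32;                   /* ... then swap halves */
    }

    int main(void)
    {
        uint64_t x = 0x0123456789ABCDEFULL;
        assert(bswap64_mips_style(x) == 0xEFCDAB8967452301ULL);
        assert(bswap64_mips_style(x) == bswap64_ref(x));
        return 0;
    }

The SHA256 configuration ($SZ==4) needs only the within-word reversal, which
is why its swap sequence in BODY_00_15 is shorter.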
diff --git a/main/openssl/crypto/sha/asm/sha512-parisc.pl b/main/openssl/crypto/sha/asm/sha512-parisc.pl
new file mode 100755
index 00000000..fc0e15b3
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-parisc.pl
@@ -0,0 +1,793 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 block procedure for PA-RISC.
+
+# June 2009.
+#
+# SHA256 performance is >75% better than gcc 3.2 generated code on
+# PA-7100LC. Compared to code generated by the vendor compiler this
+# implementation is almost 70% faster in a 64-bit build, but delivers
+# virtually the same performance in a 32-bit build on PA-8600.
+#
+# SHA512 performance is >2.9x better than gcc 3.2 generated code on
+# PA-7100LC, a PA-RISC 1.1 processor. The implementation also detects
+# whether it is executing on a PA-RISC 2.0 processor and switches to a
+# 64-bit code path, delivering adequate performance even in a "blended"
+# 32-bit build. The 64-bit code is, however, not any faster than code
+# generated by the vendor compiler on PA-8600...
+#
+# Special thanks to polarhome.com for providing an HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+if ($output =~ /512/) {
+ $func="sha512_block_data_order";
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $LAST10BITS=0x017;
+ $LD="ldd";
+ $LDM="ldd,ma";
+ $ST="std";
+} else {
+ $func="sha256_block_data_order";
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $LAST10BITS=0x0f2;
+ $LD="ldw";
+ $LDM="ldwm";
+ $ST="stw";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$XOFF=16*$SZ+32; # local variables
+$FRAME+=$XOFF;
+$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
+
+$ctx="%r26"; # zapped by $a0
+$inp="%r25"; # zapped by $a1
+$num="%r24"; # zapped by $t0
+
+$a0 ="%r26";
+$a1 ="%r25";
+$t0 ="%r24";
+$t1 ="%r29";
+$Tbl="%r31";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
+
+sub ROUND_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$code.=<<___;
+ _ror $e,$Sigma1[0],$a0
+ and $f,$e,$t0
+ _ror $e,$Sigma1[1],$a1
+ addl $t1,$h,$h
+ andcm $g,$e,$t1
+ xor $a1,$a0,$a0
+ _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
+ or $t0,$t1,$t1 ; Ch(e,f,g)
+ addl @X[$i%16],$h,$h
+ xor $a0,$a1,$a1 ; Sigma1(e)
+ addl $t1,$h,$h
+ _ror $a,$Sigma0[0],$a0
+ addl $a1,$h,$h
+
+ _ror $a,$Sigma0[1],$a1
+ and $a,$b,$t0
+ and $a,$c,$t1
+ xor $a1,$a0,$a0
+ _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
+ xor $t1,$t0,$t0
+ and $b,$c,$t1
+ xor $a0,$a1,$a1 ; Sigma0(a)
+ addl $h,$d,$d
+ xor $t1,$t0,$t0 ; Maj(a,b,c)
+ `"$LDM $SZ($Tbl),$t1" if ($i<15)`
+ addl $a1,$h,$h
+ addl $t0,$h,$h
+
+___
+}
+
+sub ROUND_16_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$i-=16;
+$code.=<<___;
+ _ror @X[($i+1)%16],$sigma0[0],$a0
+ _ror @X[($i+1)%16],$sigma0[1],$a1
+ addl @X[($i+9)%16],@X[$i],@X[$i]
+ _ror @X[($i+14)%16],$sigma1[0],$t0
+ _ror @X[($i+14)%16],$sigma1[1],$t1
+ xor $a1,$a0,$a0
+ _shr @X[($i+1)%16],$sigma0[2],$a1
+ xor $t1,$t0,$t0
+ _shr @X[($i+14)%16],$sigma1[2],$t1
+ xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
+ xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
+ $LDM $SZ($Tbl),$t1
+ addl $a0,@X[$i],@X[$i]
+ addl $t0,@X[$i],@X[$i]
+___
+$code.=<<___ if ($i==15);
+ extru $t1,31,10,$a1
+ comiclr,<> $LAST10BITS,$a1,%r0
+ ldo 1($Tbl),$Tbl ; signal end of $Tbl
+___
+&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .ALIGN 64
+L\$table
+___
+$code.=<<___ if ($SZ==8);
+ .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
+ .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
+ .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
+ .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
+ .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
+ .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
+ .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
+ .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
+ .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
+ .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
+ .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
+ .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
+ .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
+ .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
+ .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
+ .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
+ .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
+ .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
+ .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
+ .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
+ .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
+ .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
+ .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
+ .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
+ .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
+ .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
+ .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
+ .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
+ .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
+ .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
+ .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
+ .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
+ .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
+ .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
+ .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
+ .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
+ .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
+ .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
+ .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
+ .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+___
+$code.=<<___ if ($SZ==4);
+ .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+___
+$code.=<<___;
+
+ .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+$func
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ _shl $num,`log(16*$SZ)/log(2)`,$num
+ addl $inp,$num,$num ; $num to point at the end of $inp
+
+ $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
+ $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
+
+ blr %r0,$Tbl
+ ldi 3,$t1
+L\$pic
+ andcm $Tbl,$t1,$Tbl ; wipe privilege level
+ ldo L\$table-L\$pic($Tbl),$Tbl
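+	; ($Tbl now points at L\$table: blr deposited the address of L\$pic,
+	; andcm stripped its privilege-level bits, and ldo added the offset)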
+___
+$code.=<<___ if ($SZ==8 && $SIZE_T==4);
+ ldi 31,$t1
+ mtctl $t1,%cr11
+ extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
+ b L\$parisc1
+ nop
+___
+$code.=<<___;
+ $LD `0*$SZ`($ctx),$A ; load context
+ $LD `1*$SZ`($ctx),$B
+ $LD `2*$SZ`($ctx),$C
+ $LD `3*$SZ`($ctx),$D
+ $LD `4*$SZ`($ctx),$E
+ $LD `5*$SZ`($ctx),$F
+ $LD `6*$SZ`($ctx),$G
+ $LD `7*$SZ`($ctx),$H
+
+ extru $inp,31,`log($SZ)/log(2)`,$t0
+ sh3addl $t0,%r0,$t0
+ subi `8*$SZ`,$t0,$t0
+ mtctl $t0,%cr11 ; load %sar with align factor
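+	; (e.g. in the SHA512 build, an $inp 3 bytes past an 8-byte boundary
+	; yields %sar = 64-3*8 = 40, so each _align below funnel-shifts its
+	; register pair right by 40 bits)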
+
+L\$oop
+ ldi `$SZ-1`,$t0
+ $LDM $SZ($Tbl),$t1
+ andcm $inp,$t0,$t0 ; align $inp
+___
+ for ($i=0;$i<15;$i++) { # load input block
+ $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
+$code.=<<___;
+ cmpb,*= $inp,$t0,L\$aligned
+ $LD `$SZ*15`($t0),@X[15]
+ $LD `$SZ*16`($t0),@X[16]
+___
+ for ($i=0;$i<16;$i++) { # align data
+ $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
+$code.=<<___;
+L\$aligned
+	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
+___
+
+for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+L\$rounds
+	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
+___
+for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
+ nop
+
+ $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
+ $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+ $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+ ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
+
+ $LD `0*$SZ`($ctx),@X[0] ; load context
+ $LD `1*$SZ`($ctx),@X[1]
+ $LD `2*$SZ`($ctx),@X[2]
+ $LD `3*$SZ`($ctx),@X[3]
+ $LD `4*$SZ`($ctx),@X[4]
+ $LD `5*$SZ`($ctx),@X[5]
+ addl @X[0],$A,$A
+ $LD `6*$SZ`($ctx),@X[6]
+ addl @X[1],$B,$B
+ $LD `7*$SZ`($ctx),@X[7]
+ ldo `16*$SZ`($inp),$inp ; advance $inp
+
+ $ST $A,`0*$SZ`($ctx) ; save context
+ addl @X[2],$C,$C
+ $ST $B,`1*$SZ`($ctx)
+ addl @X[3],$D,$D
+ $ST $C,`2*$SZ`($ctx)
+ addl @X[4],$E,$E
+ $ST $D,`3*$SZ`($ctx)
+ addl @X[5],$F,$F
+ $ST $E,`4*$SZ`($ctx)
+ addl @X[6],$G,$G
+ $ST $F,`5*$SZ`($ctx)
+ addl @X[7],$H,$H
+ $ST $G,`6*$SZ`($ctx)
+ $ST $H,`7*$SZ`($ctx)
+
+ cmpb,*<>,n $inp,$num,L\$oop
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
+___
+if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
+{{
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 64
+L\$parisc1
+___
+
+@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
+ $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
+ ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
+$a0 ="%r17";
+$a1 ="%r18";
+$a2 ="%r19";
+$a3 ="%r20";
+$t0 ="%r21";
+$t1 ="%r22";
+$t2 ="%r28";
+$t3 ="%r29";
+$Tbl="%r31";
+
+@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
+
+sub ROUND_00_15_pa1 {
+my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+ $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+
+$code.=<<___ if (!$flag);
+ ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+ ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
+___
+$code.=<<___;
+ shd $ehi,$elo,$Sigma1[0],$t0
+ add $Xlo,$hlo,$hlo
+ shd $elo,$ehi,$Sigma1[0],$t1
+ addc $Xhi,$hhi,$hhi ; h += X[i]
+ shd $ehi,$elo,$Sigma1[1],$t2
+ ldwm 8($Tbl),$Xhi
+ shd $elo,$ehi,$Sigma1[1],$t3
+ ldw -4($Tbl),$Xlo ; load K[i]
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1
+ and $flo,$elo,$a0
+ and $fhi,$ehi,$a1
+ shd $ehi,$elo,$Sigma1[2],$t2
+ andcm $glo,$elo,$a2
+ shd $elo,$ehi,$Sigma1[2],$t3
+ andcm $ghi,$ehi,$a3
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1 ; Sigma1(e)
+ add $Xlo,$hlo,$hlo
+ xor $a2,$a0,$a0
+ addc $Xhi,$hhi,$hhi ; h += K[i]
+ xor $a3,$a1,$a1 ; Ch(e,f,g)
+
+ add $t0,$hlo,$hlo
+ shd $ahi,$alo,$Sigma0[0],$t0
+ addc $t1,$hhi,$hhi ; h += Sigma1(e)
+ shd $alo,$ahi,$Sigma0[0],$t1
+ add $a0,$hlo,$hlo
+ shd $ahi,$alo,$Sigma0[1],$t2
+ addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
+ shd $alo,$ahi,$Sigma0[1],$t3
+
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1
+ shd $ahi,$alo,$Sigma0[2],$t2
+ and $alo,$blo,$a0
+ shd $alo,$ahi,$Sigma0[2],$t3
+ and $ahi,$bhi,$a1
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1 ; Sigma0(a)
+
+ and $alo,$clo,$a2
+ and $ahi,$chi,$a3
+ xor $a2,$a0,$a0
+ add $hlo,$dlo,$dlo
+ xor $a3,$a1,$a1
+ addc $hhi,$dhi,$dhi ; d += h
+ and $blo,$clo,$a2
+ add $t0,$hlo,$hlo
+ and $bhi,$chi,$a3
+ addc $t1,$hhi,$hhi ; h += Sigma0(a)
+ xor $a2,$a0,$a0
+ add $a0,$hlo,$hlo
+ xor $a3,$a1,$a1 ; Maj(a,b,c)
+ addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
+
+___
+$code.=<<___ if ($i==15 && $flag);
+ extru $Xlo,31,10,$Xlo
+ comiclr,= $LAST10BITS,$Xlo,%r0
+ b L\$rounds_pa1
+ nop
+___
+push(@X,shift(@X)); push(@X,shift(@X));
+}
+
+sub ROUND_16_xx_pa1 {
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+my ($i)=shift;
+$i-=16;
+$code.=<<___;
+ ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+ ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
+ ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
+ ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
+ ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
+ ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
+ shd $Xnhi,$Xnlo,$sigma0[0],$t0
+ shd $Xnlo,$Xnhi,$sigma0[0],$t1
+ add $a0,$Xlo,$Xlo
+ shd $Xnhi,$Xnlo,$sigma0[1],$t2
+ addc $a1,$Xhi,$Xhi
+ shd $Xnlo,$Xnhi,$sigma0[1],$t3
+ xor $t2,$t0,$t0
+ shd $Xnhi,$Xnlo,$sigma0[2],$t2
+ xor $t3,$t1,$t1
+ extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
+ xor $t2,$t0,$t0
+ shd $a3,$a2,$sigma1[0],$a0
+	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
+ shd $a2,$a3,$sigma1[0],$a1
+ add $t0,$Xlo,$Xlo
+ shd $a3,$a2,$sigma1[1],$t2
+ addc $t1,$Xhi,$Xhi
+ shd $a2,$a3,$sigma1[1],$t3
+ xor $t2,$a0,$a0
+ shd $a3,$a2,$sigma1[2],$t2
+ xor $t3,$a1,$a1
+ extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
+ xor $t2,$a0,$a0
+	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
+ add $a0,$Xlo,$Xlo
+ addc $a1,$Xhi,$Xhi
+
+ stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
+ stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
+___
+&ROUND_00_15_pa1($i,@_,1);
+}
+$code.=<<___;
+ ldw `0*4`($ctx),$Ahi ; load context
+ ldw `1*4`($ctx),$Alo
+ ldw `2*4`($ctx),$Bhi
+ ldw `3*4`($ctx),$Blo
+ ldw `4*4`($ctx),$Chi
+ ldw `5*4`($ctx),$Clo
+ ldw `6*4`($ctx),$Dhi
+ ldw `7*4`($ctx),$Dlo
+ ldw `8*4`($ctx),$Ehi
+ ldw `9*4`($ctx),$Elo
+ ldw `10*4`($ctx),$Fhi
+ ldw `11*4`($ctx),$Flo
+ ldw `12*4`($ctx),$Ghi
+ ldw `13*4`($ctx),$Glo
+ ldw `14*4`($ctx),$Hhi
+ ldw `15*4`($ctx),$Hlo
+
+ extru $inp,31,2,$t0
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11 ; load %sar with align factor
+
+L\$oop_pa1
+ extru $inp,31,2,$a3
+ comib,= 0,$a3,L\$aligned_pa1
+ sub $inp,$a3,$inp
+
+ ldw `0*4`($inp),$X[0]
+ ldw `1*4`($inp),$X[1]
+ ldw `2*4`($inp),$t2
+ ldw `3*4`($inp),$t3
+ ldw `4*4`($inp),$a0
+ ldw `5*4`($inp),$a1
+ ldw `6*4`($inp),$a2
+ ldw `7*4`($inp),$a3
+ vshd $X[0],$X[1],$X[0]
+ vshd $X[1],$t2,$X[1]
+ stw $X[0],`-$XOFF+0*4`(%sp)
+ ldw `8*4`($inp),$t0
+ vshd $t2,$t3,$t2
+ stw $X[1],`-$XOFF+1*4`(%sp)
+ ldw `9*4`($inp),$t1
+ vshd $t3,$a0,$t3
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<=(128/4-8);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ ldw `(8+$i)*4`($inp),$t[0]
+ vshd $t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+for (;$i<(128/4-1);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ vshd $t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+$code.=<<___;
+ b L\$collected_pa1
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+
+___
+}
+$code.=<<___;
+L\$aligned_pa1
+ ldw `0*4`($inp),$X[0]
+ ldw `1*4`($inp),$X[1]
+ ldw `2*4`($inp),$t2
+ ldw `3*4`($inp),$t3
+ ldw `4*4`($inp),$a0
+ ldw `5*4`($inp),$a1
+ ldw `6*4`($inp),$a2
+ ldw `7*4`($inp),$a3
+ stw $X[0],`-$XOFF+0*4`(%sp)
+ ldw `8*4`($inp),$t0
+ stw $X[1],`-$XOFF+1*4`(%sp)
+ ldw `9*4`($inp),$t1
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<(128/4-8);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ ldw `(8+$i)*4`($inp),$t[0]
+___
+push(@t,shift(@t));
+}
+for (;$i<128/4;$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+___
+push(@t,shift(@t));
+}
+$code.="L\$collected_pa1\n";
+}
+
+for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+$code.="L\$rounds_pa1\n";
+for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
+ $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+ $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+ ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
+
+ ldw `0*4`($ctx),$t1 ; update context
+ ldw `1*4`($ctx),$t0
+ ldw `2*4`($ctx),$t3
+ ldw `3*4`($ctx),$t2
+ ldw `4*4`($ctx),$a1
+ ldw `5*4`($ctx),$a0
+ ldw `6*4`($ctx),$a3
+ add $t0,$Alo,$Alo
+ ldw `7*4`($ctx),$a2
+ addc $t1,$Ahi,$Ahi
+ ldw `8*4`($ctx),$t1
+ add $t2,$Blo,$Blo
+ ldw `9*4`($ctx),$t0
+ addc $t3,$Bhi,$Bhi
+ ldw `10*4`($ctx),$t3
+ add $a0,$Clo,$Clo
+ ldw `11*4`($ctx),$t2
+ addc $a1,$Chi,$Chi
+ ldw `12*4`($ctx),$a1
+ add $a2,$Dlo,$Dlo
+ ldw `13*4`($ctx),$a0
+ addc $a3,$Dhi,$Dhi
+ ldw `14*4`($ctx),$a3
+ add $t0,$Elo,$Elo
+ ldw `15*4`($ctx),$a2
+ addc $t1,$Ehi,$Ehi
+ stw $Ahi,`0*4`($ctx)
+ add $t2,$Flo,$Flo
+ stw $Alo,`1*4`($ctx)
+ addc $t3,$Fhi,$Fhi
+ stw $Bhi,`2*4`($ctx)
+ add $a0,$Glo,$Glo
+ stw $Blo,`3*4`($ctx)
+ addc $a1,$Ghi,$Ghi
+ stw $Chi,`4*4`($ctx)
+ add $a2,$Hlo,$Hlo
+ stw $Clo,`5*4`($ctx)
+ addc $a3,$Hhi,$Hhi
+ stw $Dhi,`6*4`($ctx)
+ ldo `16*$SZ`($inp),$inp ; advance $inp
+ stw $Dlo,`7*4`($ctx)
+ stw $Ehi,`8*4`($ctx)
+ stw $Elo,`9*4`($ctx)
+ stw $Fhi,`10*4`($ctx)
+ stw $Flo,`11*4`($ctx)
+ stw $Ghi,`12*4`($ctx)
+ stw $Glo,`13*4`($ctx)
+ stw $Hhi,`14*4`($ctx)
+ comb,= $inp,$num,L\$done
+ stw $Hlo,`15*4`($ctx)
+ b L\$oop_pa1
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
+L\$done
+___
+}}
+$code.=<<___;
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
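+# As an illustration (operands hypothetical): for "ldd 8(%r5),%r6" the
+# format 3 path below computes
+#	(0x14<<26)|(5<<21)|(6<<16)|((8&0x1FF8)<<1) = 0x50a60010
+# and emits ".WORD 0x50a60010	; ldd 8(%r5),%r6".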
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
+ { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
+ $opcode|=(1<<3) if ($mod =~ /^,m/);
+ $opcode|=(1<<2) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+ { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+	# Only the ",u" completer is used here; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
+ { sprintf "\t.WORD\t0x%08x\t; %s",
+ (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+ }
+ else { "\t".$orig; }
+};
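+# Likewise, a "shrpd %r5,%r6,%sar,%r7" (operands hypothetical) matches
+# format 11 above and comes out as
+#	(0x34<<26)|(6<<21)|(5<<16)|(1<<9)|7 = 0xd0c50207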
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
+ $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
+ : sprintf("shd\t%$1,%$2,%d",$3)/e or
+	# translate made-up instructions: _ror, _shr, _align, _shl
+ s/_ror(\s+)(%r[0-9]+),/
+ ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
+
+ s/_shr(\s+%r[0-9]+),([0-9]+),/
+ $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
+ : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
+
+ s/_align(\s+%r[0-9]+,%r[0-9]+),/
+ ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
+
+ s/_shl(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
+ : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
+
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
+
+ s/cmpb,\*/comb,/ if ($SIZE_T==4);
+
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+
+ print $_,"\n";
+}
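+# By way of example, a made-up "_ror %r5,14,%r6" (operands hypothetical)
+# is rewritten above to "shrpd %r5,%r5,14,%r6" in the SHA512 build and
+# to "shd %r5,%r5,14,%r6" in the SHA256 build; on 32-bit targets the
+# result is then also passed through &assemble.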
+
+close STDOUT;
diff --git a/main/openssl/crypto/sha/asm/sha512-ppc.pl b/main/openssl/crypto/sha/asm/sha512-ppc.pl
index 768a6a6f..6b44a68e 100755
--- a/main/openssl/crypto/sha/asm/sha512-ppc.pl
+++ b/main/openssl/crypto/sha/asm/sha512-ppc.pl
@@ -40,6 +40,7 @@ $output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
+ $LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
$SHR="srwi";
}
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
.globl $func
.align 6
$func:
+ $STU $sp,-$FRAME($sp)
mflr r0
- $STU $sp,`-($FRAME+16*$SZ)`($sp)
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
@@ -217,7 +220,7 @@ $func:
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
- b LPICmeup
+ bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
@@ -226,40 +229,14 @@ Laligned:
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
-Ldone:
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
- $POP $toc,`$FRAME-$SIZE_T*20`($sp)
- $POP r13,`$FRAME-$SIZE_T*19`($sp)
- $POP r14,`$FRAME-$SIZE_T*18`($sp)
- $POP r15,`$FRAME-$SIZE_T*17`($sp)
- $POP r16,`$FRAME-$SIZE_T*16`($sp)
- $POP r17,`$FRAME-$SIZE_T*15`($sp)
- $POP r18,`$FRAME-$SIZE_T*14`($sp)
- $POP r19,`$FRAME-$SIZE_T*13`($sp)
- $POP r20,`$FRAME-$SIZE_T*12`($sp)
- $POP r21,`$FRAME-$SIZE_T*11`($sp)
- $POP r22,`$FRAME-$SIZE_T*10`($sp)
- $POP r23,`$FRAME-$SIZE_T*9`($sp)
- $POP r24,`$FRAME-$SIZE_T*8`($sp)
- $POP r25,`$FRAME-$SIZE_T*7`($sp)
- $POP r26,`$FRAME-$SIZE_T*6`($sp)
- $POP r27,`$FRAME-$SIZE_T*5`($sp)
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- mtlr r0
- addi $sp,$sp,`$FRAME+16*$SZ`
- blr
-___
+ b Ldone
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; The PowerPC specification allows an implementation to be ill-behaved
+; upon an unaligned access that crosses a page boundary. The "better
+; safe than sorry" principle makes me treat this case specially. I
+; don't look for the particular offending word, but rather for the
+; input block that crosses the boundary; once found, that block is
+; aligned and hashed separately...
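+; As a worked example (offsets illustrative): with SHA512's 128-byte
+; blocks, an input block starting at byte 0xf90 of a page has only 0x70
+; bytes left before the page boundary, so it is copied into the aligned
+; scratch area on the stack and hashed from there.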
.align 4
Lunaligned:
subfic $t1,$inp,4096
@@ -278,7 +255,7 @@ Lunaligned:
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
- addi r20,$sp,$FRAME ; aligned spot below the frame
+ addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
- addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
- addi $inp,$sp,$FRAME ; fictitious inp pointer
+ addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
+ addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
- b Ldone
-___
-$code.=<<___;
+Ldone:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP $toc,`$FRAME-$SIZE_T*20`($sp)
+ $POP r13,`$FRAME-$SIZE_T*19`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
+
.align 4
Lsha2_block_private:
___
@@ -372,6 +375,8 @@ $code.=<<___;
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
___
# Ugly hack here, because PPC assembler syntax seems to vary too
@@ -379,22 +384,15 @@ ___
$code.=<<___;
.align 6
LPICmeup:
- bl LPIC
- addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
- b LPICedup
- nop
- nop
- nop
- nop
- nop
-LPIC: mflr $Tbl
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
+ addi $Tbl,$Tbl,`64-8`
+ mtlr r0
blr
- nop
- nop
- nop
- nop
- nop
- nop
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
diff --git a/main/openssl/crypto/sha/asm/sha512-s390x.pl b/main/openssl/crypto/sha/asm/sha512-s390x.pl
index e7ef2d5a..079a3fc7 100644
--- a/main/openssl/crypto/sha/asm/sha512-s390x.pl
+++ b/main/openssl/crypto/sha/asm/sha512-s390x.pl
@@ -26,6 +26,26 @@
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what is called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z900, SHA256 was
+# measured to perform 2.4x and SHA512 13x better than code generated by
+# gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
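+# With this in place, e.g. "stm${g}" below assembles as "stm" for the
+# -m31 flavour and as "stmg" otherwise; l${g}, st${g}, cl${g} and
+# lm${g} likewise select 4- or 8-byte variants.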
+
$t0="%r0";
$t1="%r1";
$ctx="%r2"; $t2="%r2";
@@ -44,7 +64,7 @@ $tbl="%r13";
$T1="%r14";
$sp="%r15";
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($output =~ /512/) {
@@ -78,7 +98,8 @@ if ($output =~ /512/) {
}
$Func="sha${label}_block_data_order";
$Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@ $code.=<<___;
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t2,$g
- $ST $T1,`160+$SZ*($i%16)`($sp)
+ $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
- la $T1,0($T1,$h) # T1+=h
+ algr $T1,$h # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
@@ -113,7 +134,7 @@ $code.=<<___;
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
- la $d,0($d,$T1) # d+=T1
+ algr $d,$T1 # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
@@ -122,19 +143,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i
- $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
+ $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
+ $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t1,$sigma1[0]
- xgr $T1,$t2 # sigma0(X[i+1])
+ xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
- $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
+ $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
- $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
+ $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
@@ -212,6 +233,7 @@ $code.=<<___;
.globl $Func
.type $Func,\@function
$Func:
+ sllg $len,$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
- la %r1,16($sp)
+ la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
- lg %r0,16($sp)
+ lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
- sllg %r3,$len,`log(16*$SZ)/log(2)`
+ lgr %r3,$len
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
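# kimd signals "partial completion" with condition code 3 when it has
# processed only part of the message, so "brc 1,.-4" re-executes it
# until the whole message is consumed.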
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
.Lsoftware:
___
$code.=<<___;
- sllg $len,$len,`log(16*$SZ)/log(2)`
lghi %r1,-$frame
- agr $len,$inp
- stmg $ctx,%r15,16($sp)
+ la $len,0($len,$inp)
+ stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
- stg %r0,0($sp)
+ st${g} %r0,0($sp)
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@ $code.=<<___;
clgr $len,$t0
jne .Lrounds_16_xx
- lg $ctx,`$frame+16`($sp)
+ l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
@@ -283,14 +304,14 @@ $code.=<<___;
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
- clg $inp,`$frame+32`($sp)
+ cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
- lmg %r6,%r15,`$frame+48`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/sha/asm/sha512-sparcv9.pl b/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
index ec5d7813..58574078 100644
--- a/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
+++ b/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
@@ -305,9 +305,9 @@ $code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
+ add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
- add $tmp2,$T1,$T1
add $tmp1,$T1,$T1
srl $T1,0,$T1
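+	! (summing the two sigma terms into $tmp1 first shortens the
+	! dependency chain through $T1)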
@@ -318,9 +318,9 @@ ___
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
- srl @X[($i/2)%8],0,@X[($i/2)%8]
add $xi,$T1,$T1 ! +=X[i+9]
- add $tmp2,$T1,$T1
+ add $tmp2,$tmp1,$tmp1
+ srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
diff --git a/main/openssl/crypto/sha/asm/sha512-x86_64.S b/main/openssl/crypto/sha/asm/sha512-x86_64.S
new file mode 100644
index 00000000..2d3294e0
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-x86_64.S
@@ -0,0 +1,1802 @@
+.text
+
+.globl sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+.Lprologue:
+
+ leaq K512(%rip),%rbp
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xorq %rdi,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,0(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,8(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,16(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,24(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,32(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,40(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,48(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,56(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,64(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,72(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,80(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,88(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,96(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,104(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,112(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,120(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 72(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r14,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,0(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 80(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %r14,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,8(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 88(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r14,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,16(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 96(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %r14,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,24(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 104(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r14,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,32(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 112(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %r14,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,40(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 120(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r14,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,48(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 0(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %r14,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,56(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 8(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r14,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,64(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 16(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %r14,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,72(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 24(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r14,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,80(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 32(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %r14,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,88(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 40(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r14,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,96(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 48(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %r14,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,104(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 56(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r14,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,112(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 64(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %r14,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,120(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ cmpq $80,%rdi
+ jb .Lrounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+
+ movq 128+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha512_block_data_order,.-sha512_block_data_order
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/main/openssl/crypto/sha/asm/sha512-x86_64.pl b/main/openssl/crypto/sha/asm/sha512-x86_64.pl
index e6643f8c..8d516785 100755
--- a/main/openssl/crypto/sha/asm/sha512-x86_64.pl
+++ b/main/openssl/crypto/sha/asm/sha512-x86_64.pl
@@ -51,7 +51,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
if ($output =~ /512/) {
$func="sha512_block_data_order";
@@ -95,50 +96,44 @@ sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- mov $e,$a0
- mov $e,$a1
+ ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
- ror \$$Sigma1[0],$a0
- ror \$$Sigma1[1],$a1
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
+ xor $e,$a0
xor $g,$a2 # f^g
- xor $a1,$a0
- ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
+ add $h,$T1 # T1+=h
+ xor $a,$a1
+
+ add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
- mov $T1,`$SZ*($i&0xf)`(%rsp)
+ mov $b,$h
- xor $a1,$a0 # Sigma1(e)
+ ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
+ xor $e,$a0
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
- add $h,$T1 # T1+=h
-
- mov $a,$h
- add $a0,$T1 # T1+=Sigma1(e)
+ xor $c,$h # b^c
+ xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
- mov $a,$a0
- mov $a,$a1
+ mov $b,$a2
- ror \$$Sigma0[0],$h
- ror \$$Sigma0[1],$a0
- mov $a,$a2
- add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
+ ror \$$Sigma1[0],$a0 # Sigma1(e)
+ and $a,$h # h=(b^c)&a
+ and $c,$a2 # b&c
- xor $a0,$h
- ror \$`$Sigma0[2]-$Sigma0[1]`,$a0
- or $c,$a1 # a|c
+ ror \$$Sigma0[0],$a1 # Sigma0(a)
+ add $a0,$T1 # T1+=Sigma1(e)
+	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c))
- xor $a0,$h # h=Sigma0(a)
- and $c,$a2 # a&c
add $T1,$d # d+=T1
-
- and $b,$a1 # (a|c)&b
add $T1,$h # h+=T1
-
- or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
lea 1($round),$round # round++
+ add $a1,$h # h+=Sigma0(a)
- add $a1,$h # h+=Maj(a,b,c)
___
}
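# (The rewritten round relies on the identity
#	Maj(a,b,c) = ((b^c)&a) ^ (b&c)
# where the two terms never share a set bit, so the xor can be carried
# out with "add" and folded into the h update.)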
@@ -147,32 +142,30 @@ sub ROUND_16_XX()
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
- mov `$SZ*(($i+14)&0xf)`(%rsp),$T1
-
- mov $a0,$a2
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
+ mov $a0,$T1
+ mov $a1,$a2
+ ror \$`$sigma0[1]-$sigma0[0]`,$T1
+ xor $a0,$T1
shr \$$sigma0[2],$a0
- ror \$$sigma0[0],$a2
-
- xor $a2,$a0
- ror \$`$sigma0[1]-$sigma0[0]`,$a2
- xor $a2,$a0 # sigma0(X[(i+1)&0xf])
- mov $T1,$a1
+ ror \$$sigma0[0],$T1
+ xor $T1,$a0 # sigma0(X[(i+1)&0xf])
+ mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
- shr \$$sigma1[2],$T1
- ror \$$sigma1[0],$a1
-
- xor $a1,$T1
- ror \$`$sigma1[1]-$sigma1[0]`,$a1
-
- xor $a1,$T1 # sigma1(X[(i+14)&0xf])
+ ror \$`$sigma1[1]-$sigma1[0]`,$a2
+ xor $a1,$a2
+ shr \$$sigma1[2],$a1
+ ror \$$sigma1[0],$a2
add $a0,$T1
-
- add `$SZ*(($i+9)&0xf)`(%rsp),$T1
+ xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*($i&0xf)`(%rsp),$T1
+ mov $e,$a0
+ add $a1,$T1
+ mov $a,$a1
___
&ROUND_00_15(@_);
}
@@ -219,6 +212,8 @@ $func:
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
+ $code.=" mov @ROT[4],$a0\n";
+ $code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
diff --git a/main/openssl/crypto/sha/sha.h b/main/openssl/crypto/sha/sha.h
index 16cacf9f..8a6bf4bb 100644
--- a/main/openssl/crypto/sha/sha.h
+++ b/main/openssl/crypto/sha/sha.h
@@ -106,6 +106,9 @@ typedef struct SHAstate_st
} SHA_CTX;
#ifndef OPENSSL_NO_SHA0
+#ifdef OPENSSL_FIPS
+int private_SHA_Init(SHA_CTX *c);
+#endif
int SHA_Init(SHA_CTX *c);
int SHA_Update(SHA_CTX *c, const void *data, size_t len);
int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@ unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
void SHA_Transform(SHA_CTX *c, const unsigned char *data);
#endif
#ifndef OPENSSL_NO_SHA1
+#ifdef OPENSSL_FIPS
+int private_SHA1_Init(SHA_CTX *c);
+#endif
int SHA1_Init(SHA_CTX *c);
int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@ typedef struct SHA256state_st
} SHA256_CTX;
#ifndef OPENSSL_NO_SHA256
+#ifdef OPENSSL_FIPS
+int private_SHA224_Init(SHA256_CTX *c);
+int private_SHA256_Init(SHA256_CTX *c);
+#endif
int SHA224_Init(SHA256_CTX *c);
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@ typedef struct SHA512state_st
#endif
#ifndef OPENSSL_NO_SHA512
+#ifdef OPENSSL_FIPS
+int private_SHA384_Init(SHA512_CTX *c);
+int private_SHA512_Init(SHA512_CTX *c);
+#endif
int SHA384_Init(SHA512_CTX *c);
int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
int SHA384_Final(unsigned char *md, SHA512_CTX *c);
diff --git a/main/openssl/crypto/sha/sha1_one.c b/main/openssl/crypto/sha/sha1_one.c
index 7c65b602..c56ec940 100644
--- a/main/openssl/crypto/sha/sha1_one.c
+++ b/main/openssl/crypto/sha/sha1_one.c
@@ -58,8 +58,8 @@
#include <stdio.h>
#include <string.h>
-#include <openssl/sha.h>
#include <openssl/crypto.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_SHA1
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
diff --git a/main/openssl/crypto/sha/sha1dgst.c b/main/openssl/crypto/sha/sha1dgst.c
index 50d1925c..a9869022 100644
--- a/main/openssl/crypto/sha/sha1dgst.c
+++ b/main/openssl/crypto/sha/sha1dgst.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
diff --git a/main/openssl/crypto/sha/sha256.c b/main/openssl/crypto/sha/sha256.c
index 8952d876..4eae0748 100644
--- a/main/openssl/crypto/sha/sha256.c
+++ b/main/openssl/crypto/sha/sha256.c
@@ -16,7 +16,7 @@
const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
-int SHA224_Init (SHA256_CTX *c)
+fips_md_init_ctx(SHA224, SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
@@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c)
return 1;
}
-int SHA256_Init (SHA256_CTX *c)
+fips_md_init(SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
@@ -88,17 +88,17 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c)
switch ((c)->md_len) \
{ case SHA224_DIGEST_LENGTH: \
for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
case SHA256_DIGEST_LENGTH: \
for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
default: \
if ((c)->md_len > SHA256_DIGEST_LENGTH) \
return 0; \
for (nn=0;nn<(c)->md_len/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
} \
} while (0)
diff --git a/main/openssl/crypto/sha/sha512.c b/main/openssl/crypto/sha/sha512.c
index cbc0e58c..50c229dd 100644
--- a/main/openssl/crypto/sha/sha512.c
+++ b/main/openssl/crypto/sha/sha512.c
@@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
#endif
-int SHA384_Init (SHA512_CTX *c)
+fips_md_init_ctx(SHA384, SHA512)
{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* maintain dword order required by assembler module */
- unsigned int *h = (unsigned int *)c->h;
-
- h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
- h[2] = 0x629a292a; h[3] = 0x367cd507;
- h[4] = 0x9159015a; h[5] = 0x3070dd17;
- h[6] = 0x152fecd8; h[7] = 0xf70e5939;
- h[8] = 0x67332667; h[9] = 0xffc00b31;
- h[10] = 0x8eb44a87; h[11] = 0x68581511;
- h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
- h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
-#else
c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17);
@@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c)
c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4);
-#endif
+
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1;
}
-int SHA512_Init (SHA512_CTX *c)
+fips_md_init(SHA512)
{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* maintain dword order required by assembler module */
- unsigned int *h = (unsigned int *)c->h;
-
- h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
- h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
- h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
- h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
- h[8] = 0x510e527f; h[9] = 0xade682d1;
- h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
- h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
- h[14] = 0x5be0cd19; h[15] = 0x137e2179;
-#else
c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179);
-#endif
+
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1;
@@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0;
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* recall assembler dword order... */
- n = c->md_len;
- if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
- {
- unsigned int *h = (unsigned int *)c->h, t;
-
- for (n/=4;n;n--)
- {
- t = *(h++);
- *(md++) = (unsigned char)(t>>24);
- *(md++) = (unsigned char)(t>>16);
- *(md++) = (unsigned char)(t>>8);
- *(md++) = (unsigned char)(t);
- }
- }
- else return 0;
-#else
switch (c->md_len)
{
/* Let compiler decide if it's appropriate to unroll... */
@@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */
default: return 0;
}
-#endif
+
return 1;
}
@@ -276,7 +232,14 @@ int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
{ return SHA512_Update (c,data,len); }
void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
-{ sha512_block_data_order (c,data,1); }
+ {
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+ if ((size_t)data%sizeof(c->u.d[0]) != 0)
+ memcpy(c->u.p,data,sizeof(c->u.p)),
+ data = c->u.p;
+#endif
+ sha512_block_data_order (c,data,1);
+ }
unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
{
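
The new SHA512_Transform body above bounces misaligned input through the context's internal block buffer (c->u.p aliases c->u.d inside SHA512_CTX) before handing it to the assembler routine, which on strict-alignment targets may assume 8-byte-aligned loads. A caller-side sketch of the case being guarded; the buffer here is hypothetical:

unsigned char buf[SHA512_CBLOCK + 1];
SHA512_CTX c;

SHA512_Init(&c);
/* buf + 1 is misaligned for 64-bit loads; without the memcpy fallback
 * an asm sha512_block_data_order could fault here on, say, SPARC */
SHA512_Transform(&c, buf + 1);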
diff --git a/main/openssl/crypto/sha/sha_dgst.c b/main/openssl/crypto/sha/sha_dgst.c
index 70eb5603..fb63b17f 100644
--- a/main/openssl/crypto/sha/sha_dgst.c
+++ b/main/openssl/crypto/sha/sha_dgst.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
diff --git a/main/openssl/crypto/sha/sha_locl.h b/main/openssl/crypto/sha/sha_locl.h
index 672c26ee..d673255f 100644
--- a/main/openssl/crypto/sha/sha_locl.h
+++ b/main/openssl/crypto/sha/sha_locl.h
@@ -69,11 +69,11 @@
#define HASH_CBLOCK SHA_CBLOCK
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->h0; HOST_l2c(ll,(s)); \
- ll=(c)->h1; HOST_l2c(ll,(s)); \
- ll=(c)->h2; HOST_l2c(ll,(s)); \
- ll=(c)->h3; HOST_l2c(ll,(s)); \
- ll=(c)->h4; HOST_l2c(ll,(s)); \
+ ll=(c)->h0; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h1; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h2; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h3; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h4; (void)HOST_l2c(ll,(s)); \
} while (0)
#if defined(SHA_0)
@@ -122,7 +122,11 @@ void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
#define INIT_DATA_h3 0x10325476UL
#define INIT_DATA_h4 0xc3d2e1f0UL
-int HASH_INIT (SHA_CTX *c)
+#ifdef SHA_0
+fips_md_init(SHA)
+#else
+fips_md_init_ctx(SHA1, SHA)
+#endif
{
memset (c,0,sizeof(*c));
c->h0=INIT_DATA_h0;
@@ -252,21 +256,21 @@ static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
}
else
{
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
- BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l;
- BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l;
- BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l;
- BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l;
- BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l;
- BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l;
- BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l;
- BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l;
- BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l;
- BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l;
- BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l;
- BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l;
- BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l;
- BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l;
+ (void)HOST_c2l(data,l); X( 0)=l; (void)HOST_c2l(data,l); X( 1)=l;
+ BODY_00_15( 0,A,B,C,D,E,T,X( 0)); (void)HOST_c2l(data,l); X( 2)=l;
+ BODY_00_15( 1,T,A,B,C,D,E,X( 1)); (void)HOST_c2l(data,l); X( 3)=l;
+ BODY_00_15( 2,E,T,A,B,C,D,X( 2)); (void)HOST_c2l(data,l); X( 4)=l;
+ BODY_00_15( 3,D,E,T,A,B,C,X( 3)); (void)HOST_c2l(data,l); X( 5)=l;
+ BODY_00_15( 4,C,D,E,T,A,B,X( 4)); (void)HOST_c2l(data,l); X( 6)=l;
+ BODY_00_15( 5,B,C,D,E,T,A,X( 5)); (void)HOST_c2l(data,l); X( 7)=l;
+ BODY_00_15( 6,A,B,C,D,E,T,X( 6)); (void)HOST_c2l(data,l); X( 8)=l;
+ BODY_00_15( 7,T,A,B,C,D,E,X( 7)); (void)HOST_c2l(data,l); X( 9)=l;
+ BODY_00_15( 8,E,T,A,B,C,D,X( 8)); (void)HOST_c2l(data,l); X(10)=l;
+ BODY_00_15( 9,D,E,T,A,B,C,X( 9)); (void)HOST_c2l(data,l); X(11)=l;
+ BODY_00_15(10,C,D,E,T,A,B,X(10)); (void)HOST_c2l(data,l); X(12)=l;
+ BODY_00_15(11,B,C,D,E,T,A,X(11)); (void)HOST_c2l(data,l); X(13)=l;
+ BODY_00_15(12,A,B,C,D,E,T,X(12)); (void)HOST_c2l(data,l); X(14)=l;
+ BODY_00_15(13,T,A,B,C,D,E,X(13)); (void)HOST_c2l(data,l); X(15)=l;
BODY_00_15(14,E,T,A,B,C,D,X(14));
BODY_00_15(15,D,E,T,A,B,C,X(15));
}
diff --git a/main/openssl/crypto/sparccpuid.S b/main/openssl/crypto/sparccpuid.S
index ae61f7f5..c63d5da4 100644
--- a/main/openssl/crypto/sparccpuid.S
+++ b/main/openssl/crypto/sparccpuid.S
@@ -123,7 +123,7 @@ OPENSSL_wipe_cpu:
fmovs %f1,%f3
fmovs %f0,%f2
-	add	%fp,BIAS,%i0	! return pointer to caller's top of stack
+	add	%fp,BIAS,%i0	! return pointer to caller's top of stack
ret
restore
@@ -235,10 +235,10 @@ _sparcv9_rdtick:
.global _sparcv9_vis1_probe
.align 8
_sparcv9_vis1_probe:
- .word 0x81b00d80 !fxor %f0,%f0,%f0
add %sp,BIAS+2,%o1
- retl
.word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
+ retl
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
.type _sparcv9_vis1_probe,#function
.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
diff --git a/main/openssl/crypto/sparcv9cap.c b/main/openssl/crypto/sparcv9cap.c
index ed195ab4..43b3ac6f 100644
--- a/main/openssl/crypto/sparcv9cap.c
+++ b/main/openssl/crypto/sparcv9cap.c
@@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
- if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+ if (num>=8 && !(num&1) &&
+ (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
else
@@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void)
char *e;
struct sigaction common_act,ill_oact,bus_oact;
sigset_t all_masked,oset;
- int sig;
static int trigger=0;
if (trigger) return;
diff --git a/main/openssl/crypto/srp/srp.h b/main/openssl/crypto/srp/srp.h
new file mode 100644
index 00000000..7ec7825c
--- /dev/null
+++ b/main/openssl/crypto/srp/srp.h
@@ -0,0 +1,172 @@
+/* crypto/srp/srp.h */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef __SRP_H__
+#define __SRP_H__
+
+#ifndef OPENSSL_NO_SRP
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/safestack.h>
+#include <openssl/bn.h>
+#include <openssl/crypto.h>
+
+typedef struct SRP_gN_cache_st
+ {
+ char *b64_bn;
+ BIGNUM *bn;
+ } SRP_gN_cache;
+
+
+DECLARE_STACK_OF(SRP_gN_cache)
+
+typedef struct SRP_user_pwd_st
+ {
+ char *id;
+ BIGNUM *s;
+ BIGNUM *v;
+ const BIGNUM *g;
+ const BIGNUM *N;
+ char *info;
+ } SRP_user_pwd;
+
+DECLARE_STACK_OF(SRP_user_pwd)
+
+typedef struct SRP_VBASE_st
+ {
+ STACK_OF(SRP_user_pwd) *users_pwd;
+ STACK_OF(SRP_gN_cache) *gN_cache;
+/* to simulate a user */
+ char *seed_key;
+ BIGNUM *default_g;
+ BIGNUM *default_N;
+ } SRP_VBASE;
+
+
+/* Internal structure for holding the (N, g) pairs */
+typedef struct SRP_gN_st
+ {
+ char *id;
+ BIGNUM *g;
+ BIGNUM *N;
+ } SRP_gN;
+
+DECLARE_STACK_OF(SRP_gN)
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key);
+int SRP_VBASE_free(SRP_VBASE *vb);
+int SRP_VBASE_init(SRP_VBASE *vb, char * verifier_file);
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+ char **verifier, const char *N, const char *g);
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g);
+
+
+#define SRP_NO_ERROR 0
+#define SRP_ERR_VBASE_INCOMPLETE_FILE 1
+#define SRP_ERR_VBASE_BN_LIB 2
+#define SRP_ERR_OPEN_FILE 3
+#define SRP_ERR_MEMORY 4
+
+#define DB_srptype 0
+#define DB_srpverifier 1
+#define DB_srpsalt 2
+#define DB_srpid 3
+#define DB_srpgN 4
+#define DB_srpinfo 5
+#undef DB_NUMBER
+#define DB_NUMBER 6
+
+#define DB_SRP_INDEX 'I'
+#define DB_SRP_VALID 'V'
+#define DB_SRP_REVOKED 'R'
+#define DB_SRP_MODIF 'v'
+
+
+/* see srp.c */
+char * SRP_check_known_gN_param(BIGNUM* g, BIGNUM* N);
+SRP_gN *SRP_get_default_gN(const char * id) ;
+
+/* server side .... */
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N);
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v);
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N);
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) ;
+
+
+
+/* client side .... */
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass);
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g);
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u);
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N);
+
+#define SRP_MINIMAL_N 1024
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif
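
A minimal server-side sketch of the verifier-database API declared above; the file name is hypothetical and error paths are trimmed:

#include <openssl/srp.h>

int user_known(const char *name)
	{
	SRP_VBASE *vb = SRP_VBASE_new(NULL);	/* NULL seed_key: unknown users yield NULL */
	SRP_user_pwd *p;
	int found = 0;

	if (vb == NULL)
		return -1;
	if (SRP_VBASE_init(vb, (char *)"passwd.srpv") != SRP_NO_ERROR)
		{
		SRP_VBASE_free(vb);
		return -1;
		}
	p = SRP_VBASE_get_by_user(vb, (char *)name);
	if (p != NULL)
		found = 1;	/* p->s, p->v, p->g and p->N are what SRP_Calc_B() needs */
	SRP_VBASE_free(vb);	/* frees the entries it owns, p included */
	return found;
	}

SRP_VBASE_get_by_user returns a pointer owned by the SRP_VBASE, so the entry has to be consumed before SRP_VBASE_free.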
diff --git a/main/openssl/crypto/srp/srp_grps.h b/main/openssl/crypto/srp/srp_grps.h
new file mode 100644
index 00000000..8e3c35e3
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_grps.h
@@ -0,0 +1,517 @@
+/* start of generated data */
+
+static BN_ULONG bn_group_1024_value[] = {
+ bn_pack4(0x9FC6,0x1D2F,0xC0EB,0x06E3),
+ bn_pack4(0xFD51,0x38FE,0x8376,0x435B),
+ bn_pack4(0x2FD4,0xCBF4,0x976E,0xAA9A),
+ bn_pack4(0x68ED,0xBC3C,0x0572,0x6CC0),
+ bn_pack4(0xC529,0xF566,0x660E,0x57EC),
+ bn_pack4(0x8255,0x9B29,0x7BCF,0x1885),
+ bn_pack4(0xCE8E,0xF4AD,0x69B1,0x5D49),
+ bn_pack4(0x5DC7,0xD7B4,0x6154,0xD6B6),
+ bn_pack4(0x8E49,0x5C1D,0x6089,0xDAD1),
+ bn_pack4(0xE0D5,0xD8E2,0x50B9,0x8BE4),
+ bn_pack4(0x383B,0x4813,0xD692,0xC6E0),
+ bn_pack4(0xD674,0xDF74,0x96EA,0x81D3),
+ bn_pack4(0x9EA2,0x314C,0x9C25,0x6576),
+ bn_pack4(0x6072,0x6187,0x75FF,0x3C0B),
+ bn_pack4(0x9C33,0xF80A,0xFA8F,0xC5E8),
+ bn_pack4(0xEEAF,0x0AB9,0xADB3,0x8DD6)
+};
+static BIGNUM bn_group_1024 = {
+ bn_group_1024_value,
+ (sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_1536_value[] = {
+ bn_pack4(0xCF76,0xE3FE,0xD135,0xF9BB),
+ bn_pack4(0x1518,0x0F93,0x499A,0x234D),
+ bn_pack4(0x8CE7,0xA28C,0x2442,0xC6F3),
+ bn_pack4(0x5A02,0x1FFF,0x5E91,0x479E),
+ bn_pack4(0x7F8A,0x2FE9,0xB8B5,0x292E),
+ bn_pack4(0x837C,0x264A,0xE3A9,0xBEB8),
+ bn_pack4(0xE442,0x734A,0xF7CC,0xB7AE),
+ bn_pack4(0x6577,0x2E43,0x7D6C,0x7F8C),
+ bn_pack4(0xDB2F,0xD53D,0x24B7,0xC486),
+ bn_pack4(0x6EDF,0x0195,0x3934,0x9627),
+ bn_pack4(0x158B,0xFD3E,0x2B9C,0x8CF5),
+ bn_pack4(0x764E,0x3F4B,0x53DD,0x9DA1),
+ bn_pack4(0x4754,0x8381,0xDBC5,0xB1FC),
+ bn_pack4(0x9B60,0x9E0B,0xE3BA,0xB63D),
+ bn_pack4(0x8134,0xB1C8,0xB979,0x8914),
+ bn_pack4(0xDF02,0x8A7C,0xEC67,0xF0D0),
+ bn_pack4(0x80B6,0x55BB,0x9A22,0xE8DC),
+ bn_pack4(0x1558,0x903B,0xA0D0,0xF843),
+ bn_pack4(0x51C6,0xA94B,0xE460,0x7A29),
+ bn_pack4(0x5F4F,0x5F55,0x6E27,0xCBDE),
+ bn_pack4(0xBEEE,0xA961,0x4B19,0xCC4D),
+ bn_pack4(0xDBA5,0x1DF4,0x99AC,0x4C80),
+ bn_pack4(0xB1F1,0x2A86,0x17A4,0x7BBB),
+ bn_pack4(0x9DEF,0x3CAF,0xB939,0x277A)
+};
+static BIGNUM bn_group_1536 = {
+ bn_group_1536_value,
+ (sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_2048_value[] = {
+ bn_pack4(0x0FA7,0x111F,0x9E4A,0xFF73),
+ bn_pack4(0x9B65,0xE372,0xFCD6,0x8EF2),
+ bn_pack4(0x35DE,0x236D,0x525F,0x5475),
+ bn_pack4(0x94B5,0xC803,0xD89F,0x7AE4),
+ bn_pack4(0x71AE,0x35F8,0xE9DB,0xFBB6),
+ bn_pack4(0x2A56,0x98F3,0xA8D0,0xC382),
+ bn_pack4(0x9CCC,0x041C,0x7BC3,0x08D8),
+ bn_pack4(0xAF87,0x4E73,0x03CE,0x5329),
+ bn_pack4(0x6160,0x2790,0x04E5,0x7AE6),
+ bn_pack4(0x032C,0xFBDB,0xF52F,0xB378),
+ bn_pack4(0x5EA7,0x7A27,0x75D2,0xECFA),
+ bn_pack4(0x5445,0x23B5,0x24B0,0xD57D),
+ bn_pack4(0x5B9D,0x32E6,0x88F8,0x7748),
+ bn_pack4(0xF1D2,0xB907,0x8717,0x461A),
+ bn_pack4(0x76BD,0x207A,0x436C,0x6481),
+ bn_pack4(0xCA97,0xB43A,0x23FB,0x8016),
+ bn_pack4(0x1D28,0x1E44,0x6B14,0x773B),
+ bn_pack4(0x7359,0xD041,0xD5C3,0x3EA7),
+ bn_pack4(0xA80D,0x740A,0xDBF4,0xFF74),
+ bn_pack4(0x55F9,0x7993,0xEC97,0x5EEA),
+ bn_pack4(0x2918,0xA996,0x2F0B,0x93B8),
+ bn_pack4(0x661A,0x05FB,0xD5FA,0xAAE8),
+ bn_pack4(0xCF60,0x9517,0x9A16,0x3AB3),
+ bn_pack4(0xE808,0x3969,0xEDB7,0x67B0),
+ bn_pack4(0xCD7F,0x48A9,0xDA04,0xFD50),
+ bn_pack4(0xD523,0x12AB,0x4B03,0x310D),
+ bn_pack4(0x8193,0xE075,0x7767,0xA13D),
+ bn_pack4(0xA373,0x29CB,0xB4A0,0x99ED),
+ bn_pack4(0xFC31,0x9294,0x3DB5,0x6050),
+ bn_pack4(0xAF72,0xB665,0x1987,0xEE07),
+ bn_pack4(0xF166,0xDE5E,0x1389,0x582F),
+ bn_pack4(0xAC6B,0xDB41,0x324A,0x9A9B)
+};
+static BIGNUM bn_group_2048 = {
+ bn_group_2048_value,
+ (sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_3072_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x4B82,0xD120,0xA93A,0xD2CA),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_3072 = {
+ bn_group_3072_value,
+ (sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_4096_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x4DF4,0x35C9,0x3406,0x3199),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_4096 = {
+ bn_group_4096_value,
+ (sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_6144_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0xE694,0xF91E,0x6DCC,0x4024),
+ bn_pack4(0x12BF,0x2D5B,0x0B74,0x74D6),
+ bn_pack4(0x043E,0x8F66,0x3F48,0x60EE),
+ bn_pack4(0x387F,0xE8D7,0x6E3C,0x0468),
+ bn_pack4(0xDA56,0xC9EC,0x2EF2,0x9632),
+ bn_pack4(0xEB19,0xCCB1,0xA313,0xD55C),
+ bn_pack4(0xF550,0xAA3D,0x8A1F,0xBFF0),
+ bn_pack4(0x06A1,0xD58B,0xB7C5,0xDA76),
+ bn_pack4(0xA797,0x15EE,0xF29B,0xE328),
+ bn_pack4(0x14CC,0x5ED2,0x0F80,0x37E0),
+ bn_pack4(0xCC8F,0x6D7E,0xBF48,0xE1D8),
+ bn_pack4(0x4BD4,0x07B2,0x2B41,0x54AA),
+ bn_pack4(0x0F1D,0x45B7,0xFF58,0x5AC5),
+ bn_pack4(0x23A9,0x7A7E,0x36CC,0x88BE),
+ bn_pack4(0x59E7,0xC97F,0xBEC7,0xE8F3),
+ bn_pack4(0xB5A8,0x4031,0x900B,0x1C9E),
+ bn_pack4(0xD55E,0x702F,0x4698,0x0C82),
+ bn_pack4(0xF482,0xD7CE,0x6E74,0xFEF6),
+ bn_pack4(0xF032,0xEA15,0xD172,0x1D03),
+ bn_pack4(0x5983,0xCA01,0xC64B,0x92EC),
+ bn_pack4(0x6FB8,0xF401,0x378C,0xD2BF),
+ bn_pack4(0x3320,0x5151,0x2BD7,0xAF42),
+ bn_pack4(0xDB7F,0x1447,0xE6CC,0x254B),
+ bn_pack4(0x44CE,0x6CBA,0xCED4,0xBB1B),
+ bn_pack4(0xDA3E,0xDBEB,0xCF9B,0x14ED),
+ bn_pack4(0x1797,0x27B0,0x865A,0x8918),
+ bn_pack4(0xB06A,0x53ED,0x9027,0xD831),
+ bn_pack4(0xE5DB,0x382F,0x4130,0x01AE),
+ bn_pack4(0xF8FF,0x9406,0xAD9E,0x530E),
+ bn_pack4(0xC975,0x1E76,0x3DBA,0x37BD),
+ bn_pack4(0xC1D4,0xDCB2,0x6026,0x46DE),
+ bn_pack4(0x36C3,0xFAB4,0xD27C,0x7026),
+ bn_pack4(0x4DF4,0x35C9,0x3402,0x8492),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_6144 = {
+ bn_group_6144_value,
+ (sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_8192_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x60C9,0x80DD,0x98ED,0xD3DF),
+ bn_pack4(0xC81F,0x56E8,0x80B9,0x6E71),
+ bn_pack4(0x9E30,0x50E2,0x7656,0x94DF),
+ bn_pack4(0x9558,0xE447,0x5677,0xE9AA),
+ bn_pack4(0xC919,0x0DA6,0xFC02,0x6E47),
+ bn_pack4(0x889A,0x002E,0xD5EE,0x382B),
+ bn_pack4(0x4009,0x438B,0x481C,0x6CD7),
+ bn_pack4(0x3590,0x46F4,0xEB87,0x9F92),
+ bn_pack4(0xFAF3,0x6BC3,0x1ECF,0xA268),
+ bn_pack4(0xB1D5,0x10BD,0x7EE7,0x4D73),
+ bn_pack4(0xF9AB,0x4819,0x5DED,0x7EA1),
+ bn_pack4(0x64F3,0x1CC5,0x0846,0x851D),
+ bn_pack4(0x4597,0xE899,0xA025,0x5DC1),
+ bn_pack4(0xDF31,0x0EE0,0x74AB,0x6A36),
+ bn_pack4(0x6D2A,0x13F8,0x3F44,0xF82D),
+ bn_pack4(0x062B,0x3CF5,0xB3A2,0x78A6),
+ bn_pack4(0x7968,0x3303,0xED5B,0xDD3A),
+ bn_pack4(0xFA9D,0x4B7F,0xA2C0,0x87E8),
+ bn_pack4(0x4BCB,0xC886,0x2F83,0x85DD),
+ bn_pack4(0x3473,0xFC64,0x6CEA,0x306B),
+ bn_pack4(0x13EB,0x57A8,0x1A23,0xF0C7),
+ bn_pack4(0x2222,0x2E04,0xA403,0x7C07),
+ bn_pack4(0xE3FD,0xB8BE,0xFC84,0x8AD9),
+ bn_pack4(0x238F,0x16CB,0xE39D,0x652D),
+ bn_pack4(0x3423,0xB474,0x2BF1,0xC978),
+ bn_pack4(0x3AAB,0x639C,0x5AE4,0xF568),
+ bn_pack4(0x2576,0xF693,0x6BA4,0x2466),
+ bn_pack4(0x741F,0xA7BF,0x8AFC,0x47ED),
+ bn_pack4(0x3BC8,0x32B6,0x8D9D,0xD300),
+ bn_pack4(0xD8BE,0xC4D0,0x73B9,0x31BA),
+ bn_pack4(0x3877,0x7CB6,0xA932,0xDF8C),
+ bn_pack4(0x74A3,0x926F,0x12FE,0xE5E4),
+ bn_pack4(0xE694,0xF91E,0x6DBE,0x1159),
+ bn_pack4(0x12BF,0x2D5B,0x0B74,0x74D6),
+ bn_pack4(0x043E,0x8F66,0x3F48,0x60EE),
+ bn_pack4(0x387F,0xE8D7,0x6E3C,0x0468),
+ bn_pack4(0xDA56,0xC9EC,0x2EF2,0x9632),
+ bn_pack4(0xEB19,0xCCB1,0xA313,0xD55C),
+ bn_pack4(0xF550,0xAA3D,0x8A1F,0xBFF0),
+ bn_pack4(0x06A1,0xD58B,0xB7C5,0xDA76),
+ bn_pack4(0xA797,0x15EE,0xF29B,0xE328),
+ bn_pack4(0x14CC,0x5ED2,0x0F80,0x37E0),
+ bn_pack4(0xCC8F,0x6D7E,0xBF48,0xE1D8),
+ bn_pack4(0x4BD4,0x07B2,0x2B41,0x54AA),
+ bn_pack4(0x0F1D,0x45B7,0xFF58,0x5AC5),
+ bn_pack4(0x23A9,0x7A7E,0x36CC,0x88BE),
+ bn_pack4(0x59E7,0xC97F,0xBEC7,0xE8F3),
+ bn_pack4(0xB5A8,0x4031,0x900B,0x1C9E),
+ bn_pack4(0xD55E,0x702F,0x4698,0x0C82),
+ bn_pack4(0xF482,0xD7CE,0x6E74,0xFEF6),
+ bn_pack4(0xF032,0xEA15,0xD172,0x1D03),
+ bn_pack4(0x5983,0xCA01,0xC64B,0x92EC),
+ bn_pack4(0x6FB8,0xF401,0x378C,0xD2BF),
+ bn_pack4(0x3320,0x5151,0x2BD7,0xAF42),
+ bn_pack4(0xDB7F,0x1447,0xE6CC,0x254B),
+ bn_pack4(0x44CE,0x6CBA,0xCED4,0xBB1B),
+ bn_pack4(0xDA3E,0xDBEB,0xCF9B,0x14ED),
+ bn_pack4(0x1797,0x27B0,0x865A,0x8918),
+ bn_pack4(0xB06A,0x53ED,0x9027,0xD831),
+ bn_pack4(0xE5DB,0x382F,0x4130,0x01AE),
+ bn_pack4(0xF8FF,0x9406,0xAD9E,0x530E),
+ bn_pack4(0xC975,0x1E76,0x3DBA,0x37BD),
+ bn_pack4(0xC1D4,0xDCB2,0x6026,0x46DE),
+ bn_pack4(0x36C3,0xFAB4,0xD27C,0x7026),
+ bn_pack4(0x4DF4,0x35C9,0x3402,0x8492),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_8192 = {
+ bn_group_8192_value,
+ (sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_generator_19_value[] = {19} ;
+static BIGNUM bn_generator_19 = {
+ bn_generator_19_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_5_value[] = {5} ;
+static BIGNUM bn_generator_5 = {
+ bn_generator_5_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_2_value[] = {2} ;
+static BIGNUM bn_generator_2 = {
+ bn_generator_2_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static SRP_gN knowngN[] = {
+ {"8192",&bn_generator_19 , &bn_group_8192},
+ {"6144",&bn_generator_5 , &bn_group_6144},
+ {"4096",&bn_generator_5 , &bn_group_4096},
+ {"3072",&bn_generator_5 , &bn_group_3072},
+ {"2048",&bn_generator_2 , &bn_group_2048},
+ {"1536",&bn_generator_2 , &bn_group_1536},
+ {"1024",&bn_generator_2 , &bn_group_1024},
+};
+#define KNOWN_GN_NUMBER sizeof(knowngN) / sizeof(SRP_gN)
+
+/* end of generated data */
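
Each static initializer above fills struct bignum_st in declaration order, which the pre-1.1.0 public BIGNUM layout permits. A one-word illustration of the pattern (the demo names are mine, not generated data):

static BN_ULONG demo_value[] = { 5 };
static BIGNUM demo = {
	demo_value,				/* d:    words, least significant first */
	sizeof(demo_value)/sizeof(BN_ULONG),	/* top:  words in use */
	sizeof(demo_value)/sizeof(BN_ULONG),	/* dmax: words available in d */
	0,					/* neg:  0 = non-negative */
	BN_FLG_STATIC_DATA			/* tells BN_free not to touch the storage */
};

BN_FLG_STATIC_DATA is what allows srp_lib.c to hand these groups out directly (for instance from SRP_get_default_gN) without any copying or reference counting.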
diff --git a/main/openssl/crypto/srp/srp_lcl.h b/main/openssl/crypto/srp/srp_lcl.h
new file mode 100644
index 00000000..42bda3f1
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_lcl.h
@@ -0,0 +1,83 @@
+/* crypto/srp/srp_lcl.h */
+/* Written by Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef HEADER_SRP_LCL_H
+#define HEADER_SRP_LCL_H
+
+#include <openssl/srp.h>
+#include <openssl/sha.h>
+
+#if 0
+#define srp_bn_print(a) {fprintf(stderr, #a "="); BN_print_fp(stderr,a); \
+ fprintf(stderr,"\n");}
+#else
+#define srp_bn_print(a)
+#endif
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/main/openssl/crypto/srp/srp_lib.c b/main/openssl/crypto/srp/srp_lib.c
new file mode 100644
index 00000000..7c1dcc51
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_lib.c
@@ -0,0 +1,361 @@
+/* crypto/srp/srp_lib.c */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+
+#if (BN_BYTES == 8)
+# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+# define bn_pack4(a1,a2,a3,a4) ((a1##UI64<<48)|(a2##UI64<<32)|(a3##UI64<<16)|a4##UI64)
+# elif defined(__arch64__)
+# define bn_pack4(a1,a2,a3,a4) ((a1##UL<<48)|(a2##UL<<32)|(a3##UL<<16)|a4##UL)
+# else
+# define bn_pack4(a1,a2,a3,a4) ((a1##ULL<<48)|(a2##ULL<<32)|(a3##ULL<<16)|a4##ULL)
+# endif
+#elif (BN_BYTES == 4)
+# define bn_pack4(a1,a2,a3,a4) ((a3##UL<<16)|a4##UL), ((a1##UL<<16)|a2##UL)
+#else
+# error "unsupported BN_BYTES"
+#endif
+
+
+#include "srp_grps.h"
+
+static BIGNUM *srp_Calc_k(BIGNUM *N, BIGNUM *g)
+ {
+ /* k = SHA1(N | PAD(g)) -- tls-srp draft 8 */
+
+ unsigned char digest[SHA_DIGEST_LENGTH];
+ unsigned char *tmp;
+ EVP_MD_CTX ctxt;
+ int longg ;
+ int longN = BN_num_bytes(N);
+
+ if ((tmp = OPENSSL_malloc(longN)) == NULL)
+ return NULL;
+ BN_bn2bin(N,tmp) ;
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, tmp, longN);
+
+ memset(tmp, 0, longN);
+ longg = BN_bn2bin(g,tmp) ;
+	/* use the trailing zeros as left padding */
+ EVP_DigestUpdate(&ctxt, tmp + longg, longN-longg);
+ EVP_DigestUpdate(&ctxt, tmp, longg);
+ OPENSSL_free(tmp);
+
+ EVP_DigestFinal_ex(&ctxt, digest, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+ return BN_bin2bn(digest, sizeof(digest), NULL);
+ }
+
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N)
+ {
+	/* u = SHA1(PAD(A) || PAD(B)) -- tls-srp draft 8 */
+
+ BIGNUM *u;
+ unsigned char cu[SHA_DIGEST_LENGTH];
+ unsigned char *cAB;
+ EVP_MD_CTX ctxt;
+ int longN;
+ if ((A == NULL) ||(B == NULL) || (N == NULL))
+ return NULL;
+
+ longN= BN_num_bytes(N);
+
+ if ((cAB = OPENSSL_malloc(2*longN)) == NULL)
+ return NULL;
+
+ memset(cAB, 0, longN);
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(A,cAB+longN), longN);
+ EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(B,cAB+longN), longN);
+ OPENSSL_free(cAB);
+ EVP_DigestFinal_ex(&ctxt, cu, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+
+ if (!(u = BN_bin2bn(cu, sizeof(cu), NULL)))
+ return NULL;
+ if (!BN_is_zero(u))
+ return u;
+ BN_free(u);
+ return NULL;
+}
+
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N)
+ {
+ BIGNUM *tmp = NULL, *S = NULL;
+ BN_CTX *bn_ctx;
+
+ if (u == NULL || A == NULL || v == NULL || b == NULL || N == NULL)
+ return NULL;
+
+ if ((bn_ctx = BN_CTX_new()) == NULL ||
+ (tmp = BN_new()) == NULL ||
+ (S = BN_new()) == NULL )
+ goto err;
+
+ /* S = (A*v**u) ** b */
+
+ if (!BN_mod_exp(tmp,v,u,N,bn_ctx))
+ goto err;
+ if (!BN_mod_mul(tmp,A,tmp,N,bn_ctx))
+ goto err;
+ if (!BN_mod_exp(S,tmp,b,N,bn_ctx))
+ goto err;
+err:
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(tmp);
+ return S;
+ }
+
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v)
+ {
+ BIGNUM *kv = NULL, *gb = NULL;
+ BIGNUM *B = NULL, *k = NULL;
+ BN_CTX *bn_ctx;
+
+ if (b == NULL || N == NULL || g == NULL || v == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ( (kv = BN_new()) == NULL ||
+ (gb = BN_new()) == NULL ||
+ (B = BN_new())== NULL)
+ goto err;
+
+ /* B = g**b + k*v */
+
+ if (!BN_mod_exp(gb,g,b,N,bn_ctx) ||
+ !(k = srp_Calc_k(N,g)) ||
+ !BN_mod_mul(kv,v,k,N,bn_ctx) ||
+ !BN_mod_add(B,gb,kv,N,bn_ctx))
+ {
+ BN_free(B);
+ B = NULL;
+ }
+err:
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(kv);
+ BN_clear_free(gb);
+ BN_free(k);
+ return B;
+ }
+
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass)
+ {
+ unsigned char dig[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+ unsigned char *cs;
+
+ if ((s == NULL) ||
+ (user == NULL) ||
+ (pass == NULL))
+ return NULL;
+
+ if ((cs = OPENSSL_malloc(BN_num_bytes(s))) == NULL)
+ return NULL;
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, user, strlen(user));
+ EVP_DigestUpdate(&ctxt, ":", 1);
+ EVP_DigestUpdate(&ctxt, pass, strlen(pass));
+ EVP_DigestFinal_ex(&ctxt, dig, NULL);
+
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ BN_bn2bin(s,cs);
+ EVP_DigestUpdate(&ctxt, cs, BN_num_bytes(s));
+ OPENSSL_free(cs);
+ EVP_DigestUpdate(&ctxt, dig, sizeof(dig));
+ EVP_DigestFinal_ex(&ctxt, dig, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+
+ return BN_bin2bn(dig, sizeof(dig), NULL);
+ }
+
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g)
+ {
+ BN_CTX *bn_ctx;
+ BIGNUM * A = NULL;
+
+ if (a == NULL || N == NULL || g == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ((A = BN_new()) != NULL &&
+ !BN_mod_exp(A,g,a,N,bn_ctx))
+ {
+ BN_free(A);
+ A = NULL;
+ }
+ BN_CTX_free(bn_ctx);
+ return A;
+ }
+
+
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u)
+ {
+ BIGNUM *tmp = NULL, *tmp2 = NULL, *tmp3 = NULL , *k = NULL, *K = NULL;
+ BN_CTX *bn_ctx;
+
+ if (u == NULL || B == NULL || N == NULL || g == NULL || x == NULL || a == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ((tmp = BN_new()) == NULL ||
+ (tmp2 = BN_new())== NULL ||
+ (tmp3 = BN_new())== NULL ||
+ (K = BN_new()) == NULL)
+ goto err;
+
+ if (!BN_mod_exp(tmp,g,x,N,bn_ctx))
+ goto err;
+ if (!(k = srp_Calc_k(N,g)))
+ goto err;
+ if (!BN_mod_mul(tmp2,tmp,k,N,bn_ctx))
+ goto err;
+ if (!BN_mod_sub(tmp,B,tmp2,N,bn_ctx))
+ goto err;
+
+ if (!BN_mod_mul(tmp3,u,x,N,bn_ctx))
+ goto err;
+ if (!BN_mod_add(tmp2,a,tmp3,N,bn_ctx))
+ goto err;
+ if (!BN_mod_exp(K,tmp,tmp2,N,bn_ctx))
+ goto err;
+
+err :
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(tmp);
+ BN_clear_free(tmp2);
+ BN_clear_free(tmp3);
+ BN_free(k);
+ return K;
+ }
+
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N)
+ {
+ BIGNUM *r;
+ BN_CTX *bn_ctx;
+ int ret = 0;
+
+ if (B == NULL || N == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return 0;
+
+ if ((r = BN_new()) == NULL)
+ goto err;
+	/* reject B when B % N == 0 */
+ if (!BN_nnmod(r,B,N,bn_ctx))
+ goto err;
+ ret = !BN_is_zero(r);
+err:
+ BN_CTX_free(bn_ctx);
+ BN_free(r);
+ return ret;
+ }
+
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N)
+ {
+	/* likewise rejects A when A % N == 0 */
+ return SRP_Verify_B_mod_N(A,N) ;
+ }
+
+
+/* Check whether g and N are known parameters.
+   The values have been generated from the ietf-tls-srp draft version 8.
+*/
+char *SRP_check_known_gN_param(BIGNUM *g, BIGNUM *N)
+ {
+ size_t i;
+ if ((g == NULL) || (N == NULL))
+ return 0;
+
+ srp_bn_print(g);
+ srp_bn_print(N);
+
+ for(i = 0; i < KNOWN_GN_NUMBER; i++)
+ {
+ if (BN_cmp(knowngN[i].g, g) == 0 && BN_cmp(knowngN[i].N, N) == 0)
+ return knowngN[i].id;
+ }
+ return NULL;
+ }
+
+SRP_gN *SRP_get_default_gN(const char *id)
+ {
+ size_t i;
+
+ if (id == NULL)
+ return knowngN;
+ for(i = 0; i < KNOWN_GN_NUMBER; i++)
+ {
+ if (strcmp(knowngN[i].id, id)==0)
+ return knowngN + i;
+ }
+ return NULL;
+ }
+#endif
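
Taken together, the routines above cover both halves of the SRP-6a exchange. A self-contained round trip against this API; error handling and BN_free cleanup are elided, and the identity/password are placeholders:

#include <openssl/bn.h>
#include <openssl/srp.h>

int srp_roundtrip(void)
	{
	SRP_gN *gN = SRP_get_default_gN("1024");
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *s = BN_new(), *a = BN_new(), *b = BN_new(), *v = BN_new();
	BIGNUM *x, *A, *B, *u, *Kclient, *Kserver;

	BN_rand(s, 64, 0, 0);			/* salt */
	BN_rand(a, 256, 0, 0);			/* client ephemeral secret */
	BN_rand(b, 256, 0, 0);			/* server ephemeral secret */

	x = SRP_Calc_x(s, "alice", "password");	/* x = H(s | H(user ":" pass)) */
	BN_mod_exp(v, gN->g, x, gN->N, ctx);	/* enrolment: verifier v = g^x mod N */

	A = SRP_Calc_A(a, gN->N, gN->g);	/* client -> server */
	B = SRP_Calc_B(b, gN->N, gN->g, v);	/* server -> client */
	u = SRP_Calc_u(A, B, gN->N);		/* both ends derive u from A and B */

	Kclient = SRP_Calc_client_key(gN->N, B, gN->g, x, a, u);
	Kserver = SRP_Calc_server_key(A, v, u, b, gN->N);
	return BN_cmp(Kclient, Kserver) == 0;	/* 1 if both sides agree */
	}

In a real handshake each side would also call SRP_Verify_A_mod_N or SRP_Verify_B_mod_N on the value it received, precisely to reject the A % N == 0 degenerate case the comments above mention.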
diff --git a/main/openssl/crypto/srp/srp_vfy.c b/main/openssl/crypto/srp/srp_vfy.c
new file mode 100644
index 00000000..4a3d13ed
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_vfy.c
@@ -0,0 +1,658 @@
+/* crypto/srp/srp_vfy.c */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+#include <openssl/buffer.h>
+#include <openssl/rand.h>
+#include <openssl/txt_db.h>
+
+#define SRP_RANDOM_SALT_LEN 20
+#define MAX_LEN 2500
+
+static char b64table[] =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
+
+/* the following two conversion routines have been inspired by code from Stanford */
+
+/*
+ * Convert a base64 string into raw byte array representation.
+ */
+static int t_fromb64(unsigned char *a, const char *src)
+ {
+ char *loc;
+ int i, j;
+ int size;
+
+ while(*src && (*src == ' ' || *src == '\t' || *src == '\n'))
+ ++src;
+ size = strlen(src);
+ i = 0;
+ while(i < size)
+ {
+ loc = strchr(b64table, src[i]);
+ if(loc == (char *) 0) break;
+ else a[i] = loc - b64table;
+ ++i;
+ }
+ size = i;
+ i = size - 1;
+ j = size;
+ while(1)
+ {
+ a[j] = a[i];
+ if(--i < 0) break;
+ a[j] |= (a[i] & 3) << 6;
+ --j;
+ a[j] = (unsigned char) ((a[i] & 0x3c) >> 2);
+ if(--i < 0) break;
+ a[j] |= (a[i] & 0xf) << 4;
+ --j;
+ a[j] = (unsigned char) ((a[i] & 0x30) >> 4);
+ if(--i < 0) break;
+ a[j] |= (a[i] << 2);
+
+ a[--j] = 0;
+ if(--i < 0) break;
+ }
+ while(a[j] == 0 && j <= size) ++j;
+ i = 0;
+ while (j <= size) a[i++] = a[j++];
+ return i;
+ }
+
+
+/*
+ * Convert a raw byte string into a null-terminated base64 ASCII string.
+ */
+static char *t_tob64(char *dst, const unsigned char *src, int size)
+ {
+ int c, pos = size % 3;
+ unsigned char b0 = 0, b1 = 0, b2 = 0, notleading = 0;
+ char *olddst = dst;
+
+ switch(pos)
+ {
+ case 1:
+ b2 = src[0];
+ break;
+ case 2:
+ b1 = src[0];
+ b2 = src[1];
+ break;
+ }
+
+ while(1)
+ {
+ c = (b0 & 0xfc) >> 2;
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = ((b0 & 3) << 4) | ((b1 & 0xf0) >> 4);
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = ((b1 & 0xf) << 2) | ((b2 & 0xc0) >> 6);
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = b2 & 0x3f;
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ if(pos >= size) break;
+ else
+ {
+ b0 = src[pos++];
+ b1 = src[pos++];
+ b2 = src[pos++];
+ }
+ }
+
+ *dst++ = '\0';
+ return olddst;
+ }
+
+static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+ {
+ if (user_pwd == NULL)
+ return;
+ BN_free(user_pwd->s);
+ BN_clear_free(user_pwd->v);
+ OPENSSL_free(user_pwd->id);
+ OPENSSL_free(user_pwd->info);
+ OPENSSL_free(user_pwd);
+ }
+
+static SRP_user_pwd *SRP_user_pwd_new()
+ {
+ SRP_user_pwd *ret = OPENSSL_malloc(sizeof(SRP_user_pwd));
+ if (ret == NULL)
+ return NULL;
+ ret->N = NULL;
+ ret->g = NULL;
+ ret->s = NULL;
+ ret->v = NULL;
+ ret->id = NULL ;
+ ret->info = NULL;
+ return ret;
+ }
+
+static void SRP_user_pwd_set_gN(SRP_user_pwd *vinfo, const BIGNUM *g,
+ const BIGNUM *N)
+ {
+ vinfo->N = N;
+ vinfo->g = g;
+ }
+
+static int SRP_user_pwd_set_ids(SRP_user_pwd *vinfo, const char *id,
+ const char *info)
+ {
+ if (id != NULL && NULL == (vinfo->id = BUF_strdup(id)))
+ return 0;
+ return (info == NULL || NULL != (vinfo->info = BUF_strdup(info))) ;
+ }
+
+static int SRP_user_pwd_set_sv(SRP_user_pwd *vinfo, const char *s,
+ const char *v)
+ {
+ unsigned char tmp[MAX_LEN];
+ int len;
+
+ if (strlen(s) > MAX_LEN || strlen(v) > MAX_LEN)
+ return 0;
+ len = t_fromb64(tmp, v);
+ if (NULL == (vinfo->v = BN_bin2bn(tmp, len, NULL)) )
+ return 0;
+ len = t_fromb64(tmp, s);
+ return ((vinfo->s = BN_bin2bn(tmp, len, NULL)) != NULL) ;
+ }
+
+static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
+ {
+ vinfo->v = v;
+ vinfo->s = s;
+ return (vinfo->s != NULL && vinfo->v != NULL) ;
+ }
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key)
+ {
+ SRP_VBASE *vb = (SRP_VBASE *) OPENSSL_malloc(sizeof(SRP_VBASE));
+
+ if (vb == NULL)
+ return NULL;
+ if (!(vb->users_pwd = sk_SRP_user_pwd_new_null()) ||
+ !(vb->gN_cache = sk_SRP_gN_cache_new_null()))
+ {
+ OPENSSL_free(vb);
+ return NULL;
+ }
+ vb->default_g = NULL;
+ vb->default_N = NULL;
+ vb->seed_key = NULL;
+ if ((seed_key != NULL) &&
+ (vb->seed_key = BUF_strdup(seed_key)) == NULL)
+ {
+ sk_SRP_user_pwd_free(vb->users_pwd);
+ sk_SRP_gN_cache_free(vb->gN_cache);
+ OPENSSL_free(vb);
+ return NULL;
+ }
+ return vb;
+ }
+
+
+int SRP_VBASE_free(SRP_VBASE *vb)
+ {
+ sk_SRP_user_pwd_pop_free(vb->users_pwd,SRP_user_pwd_free);
+ sk_SRP_gN_cache_free(vb->gN_cache);
+ OPENSSL_free(vb->seed_key);
+ OPENSSL_free(vb);
+ return 0;
+ }
+
+
+static SRP_gN_cache *SRP_gN_new_init(const char *ch)
+ {
+ unsigned char tmp[MAX_LEN];
+ int len;
+
+ SRP_gN_cache *newgN = (SRP_gN_cache *)OPENSSL_malloc(sizeof(SRP_gN_cache));
+ if (newgN == NULL)
+ return NULL;
+
+ if ((newgN->b64_bn = BUF_strdup(ch)) == NULL)
+ goto err;
+
+ len = t_fromb64(tmp, ch);
+ if ((newgN->bn = BN_bin2bn(tmp, len, NULL)))
+ return newgN;
+
+ OPENSSL_free(newgN->b64_bn);
+err:
+ OPENSSL_free(newgN);
+ return NULL;
+ }
+
+
+static void SRP_gN_free(SRP_gN_cache *gN_cache)
+ {
+ if (gN_cache == NULL)
+ return;
+ OPENSSL_free(gN_cache->b64_bn);
+ BN_free(gN_cache->bn);
+ OPENSSL_free(gN_cache);
+ }
+
+static SRP_gN *SRP_get_gN_by_id(const char *id, STACK_OF(SRP_gN) *gN_tab)
+ {
+ int i;
+
+ SRP_gN *gN;
+ if (gN_tab != NULL)
+ for(i = 0; i < sk_SRP_gN_num(gN_tab); i++)
+ {
+ gN = sk_SRP_gN_value(gN_tab, i);
+ if (gN && (id == NULL || strcmp(gN->id,id)==0))
+ return gN;
+ }
+
+ return SRP_get_default_gN(id);
+ }
+
+static BIGNUM *SRP_gN_place_bn(STACK_OF(SRP_gN_cache) *gN_cache, char *ch)
+ {
+ int i;
+ if (gN_cache == NULL)
+ return NULL;
+
+	/* check whether we already have one... */
+ for(i = 0; i < sk_SRP_gN_cache_num(gN_cache); i++)
+ {
+ SRP_gN_cache *cache = sk_SRP_gN_cache_value(gN_cache, i);
+ if (strcmp(cache->b64_bn,ch)==0)
+ return cache->bn;
+ }
+ { /* it is the first time that we find it */
+ SRP_gN_cache *newgN = SRP_gN_new_init(ch);
+ if (newgN)
+ {
+ if (sk_SRP_gN_cache_insert(gN_cache,newgN,0)>0)
+ return newgN->bn;
+ SRP_gN_free(newgN);
+ }
+ }
+ return NULL;
+ }
+
+/* This function parses the verifier file. The format is:
+ * string(index):base64(N):base64(g):0
+ * string(username):base64(v):base64(salt):int(index)
+ */
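+/* A hypothetical example (base64 fields abbreviated, not real values):
+ *   2048:AJbkVZ...:Ag==:0
+ *   alice:A5QCyL...:PsfCVQ==:2048
+ * The first line defines index "2048" with its N and g parameters; the
+ * second defines user "alice" with her verifier, salt, and that index.
+ */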
+
+
+int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
+ {
+ int error_code ;
+ STACK_OF(SRP_gN) *SRP_gN_tab = sk_SRP_gN_new_null();
+ char *last_index = NULL;
+ int i;
+ char **pp;
+
+ SRP_gN *gN = NULL;
+ SRP_user_pwd *user_pwd = NULL ;
+
+ TXT_DB *tmpdb = NULL;
+ BIO *in = BIO_new(BIO_s_file());
+
+ error_code = SRP_ERR_OPEN_FILE;
+
+ if (in == NULL || BIO_read_filename(in,verifier_file) <= 0)
+ goto err;
+
+ error_code = SRP_ERR_VBASE_INCOMPLETE_FILE;
+
+ if ((tmpdb =TXT_DB_read(in,DB_NUMBER)) == NULL)
+ goto err;
+
+ error_code = SRP_ERR_MEMORY;
+
+
+ if (vb->seed_key)
+ {
+ last_index = SRP_get_default_gN(NULL)->id;
+ }
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(tmpdb->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(tmpdb->data,i);
+ if (pp[DB_srptype][0] == DB_SRP_INDEX)
+ {
+			/* add this (g, N) pair to the internal stack */
+
+ if ((gN = (SRP_gN *)OPENSSL_malloc(sizeof(SRP_gN))) == NULL)
+ goto err;
+
+ if (!(gN->id = BUF_strdup(pp[DB_srpid]))
+ || !(gN->N = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpverifier]))
+ || !(gN->g = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpsalt]))
+ || sk_SRP_gN_insert(SRP_gN_tab,gN,0) == 0)
+ goto err;
+
+ gN = NULL;
+
+ if (vb->seed_key != NULL)
+ {
+ last_index = pp[DB_srpid];
+ }
+ }
+ else if (pp[DB_srptype][0] == DB_SRP_VALID)
+ {
+			/* this is a user entry */
+ SRP_gN *lgN;
+ if ((lgN = SRP_get_gN_by_id(pp[DB_srpgN],SRP_gN_tab))!=NULL)
+ {
+ error_code = SRP_ERR_MEMORY;
+ if ((user_pwd = SRP_user_pwd_new()) == NULL)
+ goto err;
+
+ SRP_user_pwd_set_gN(user_pwd,lgN->g,lgN->N);
+ if (!SRP_user_pwd_set_ids(user_pwd, pp[DB_srpid],pp[DB_srpinfo]))
+ goto err;
+
+ error_code = SRP_ERR_VBASE_BN_LIB;
+ if (!SRP_user_pwd_set_sv(user_pwd, pp[DB_srpsalt],pp[DB_srpverifier]))
+ goto err;
+
+ if (sk_SRP_user_pwd_insert(vb->users_pwd, user_pwd, 0) == 0)
+ goto err;
+				user_pwd = NULL; /* ownership passed to the stack */
+ }
+ }
+ }
+
+ if (last_index != NULL)
+ {
+		/* set up the default g and N used to simulate a default user */
+
+ if (((gN = SRP_get_gN_by_id(last_index,SRP_gN_tab))==NULL))
+ {
+ error_code = SRP_ERR_VBASE_BN_LIB;
+ goto err;
+ }
+ vb->default_g = gN->g ;
+ vb->default_N = gN->N ;
+ gN = NULL ;
+ }
+ error_code = SRP_NO_ERROR;
+
+ err:
+	/* there may still be some leaks to fix; if this fails, the application most likely terminates */
+
+ if (gN != NULL)
+ {
+ OPENSSL_free(gN->id);
+ OPENSSL_free(gN);
+ }
+
+ SRP_user_pwd_free(user_pwd);
+
+ if (tmpdb) TXT_DB_free(tmpdb);
+ if (in) BIO_free_all(in);
+
+ sk_SRP_gN_free(SRP_gN_tab);
+
+ return error_code;
+
+ }
+
+
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+ {
+ int i;
+ SRP_user_pwd *user;
+ unsigned char digv[SHA_DIGEST_LENGTH];
+ unsigned char digs[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+
+ if (vb == NULL)
+ return NULL;
+ for(i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++)
+ {
+ user = sk_SRP_user_pwd_value(vb->users_pwd, i);
+ if (strcmp(user->id,username)==0)
+ return user;
+ }
+ if ((vb->seed_key == NULL) ||
+ (vb->default_g == NULL) ||
+ (vb->default_N == NULL))
+ return NULL;
+
+/* if the user is unknown and a seed_key is set, we still return parameters,
+ * so the caller cannot tell existing users from unknown ones */
+
+ if ((user = SRP_user_pwd_new()) == NULL)
+ return NULL;
+
+ SRP_user_pwd_set_gN(user,vb->default_g,vb->default_N);
+
+ if (!SRP_user_pwd_set_ids(user,username,NULL))
+ goto err;
+
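+	/* the fake verifier is random, while the fake salt is a hash of
+	 * seed_key and username, so repeated lookups of the same unknown
+	 * user return the same salt */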
+ RAND_pseudo_bytes(digv, SHA_DIGEST_LENGTH);
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, vb->seed_key, strlen(vb->seed_key));
+ EVP_DigestUpdate(&ctxt, username, strlen(username));
+ EVP_DigestFinal_ex(&ctxt, digs, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+ if (SRP_user_pwd_set_sv_BN(user, BN_bin2bn(digs,SHA_DIGEST_LENGTH,NULL), BN_bin2bn(digv,SHA_DIGEST_LENGTH, NULL)))
+ return user;
+
+err: SRP_user_pwd_free(user);
+ return NULL;
+ }
+
+
+/*
 create a verifier (*salt, *verifier, g and N are in base64)
+*/
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+ char **verifier, const char *N, const char *g)
+ {
+ int len;
+ char * result=NULL;
+ char *vf;
+ BIGNUM *N_bn = NULL, *g_bn = NULL, *s = NULL, *v = NULL;
+ unsigned char tmp[MAX_LEN];
+ unsigned char tmp2[MAX_LEN];
+ char * defgNid = NULL;
+
+ if ((user == NULL)||
+ (pass == NULL)||
+ (salt == NULL)||
+ (verifier == NULL))
+ goto err;
+
+ if (N)
+ {
+ if (!(len = t_fromb64(tmp, N))) goto err;
+ N_bn = BN_bin2bn(tmp, len, NULL);
+ if (!(len = t_fromb64(tmp, g))) goto err;
+ g_bn = BN_bin2bn(tmp, len, NULL);
+ defgNid = "*";
+ }
+ else
+ {
+ SRP_gN * gN = SRP_get_gN_by_id(g, NULL) ;
+ if (gN == NULL)
+ goto err;
+ N_bn = gN->N;
+ g_bn = gN->g;
+ defgNid = gN->id;
+ }
+
+ if (*salt == NULL)
+ {
+ RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+ s = BN_bin2bn(tmp2, SRP_RANDOM_SALT_LEN, NULL);
+ }
+ else
+ {
+ if (!(len = t_fromb64(tmp2, *salt)))
+ goto err;
+ s = BN_bin2bn(tmp2, len, NULL);
+ }
+
+
+ if(!SRP_create_verifier_BN(user, pass, &s, &v, N_bn, g_bn)) goto err;
+
+ BN_bn2bin(v,tmp);
+ if (((vf = OPENSSL_malloc(BN_num_bytes(v)*2)) == NULL))
+ goto err;
+ t_tob64(vf, tmp, BN_num_bytes(v));
+
+ *verifier = vf;
+ if (*salt == NULL)
+ {
+ char *tmp_salt;
+
+ if ((tmp_salt = OPENSSL_malloc(SRP_RANDOM_SALT_LEN * 2)) == NULL)
+ {
+ OPENSSL_free(vf);
+ goto err;
+ }
+ t_tob64(tmp_salt, tmp2, SRP_RANDOM_SALT_LEN);
+ *salt = tmp_salt;
+ }
+
+ result=defgNid;
+
+err:
+ if(N)
+ {
+ BN_free(N_bn);
+ BN_free(g_bn);
+ }
+ return result;
+ }
+
+/*
 create a verifier (*salt, *verifier, g and N are BIGNUMs)
+*/
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g)
+ {
+ int result=0;
+ BIGNUM *x = NULL;
+ BN_CTX *bn_ctx = BN_CTX_new();
+ unsigned char tmp2[MAX_LEN];
+
+ if ((user == NULL)||
+ (pass == NULL)||
+ (salt == NULL)||
+ (verifier == NULL)||
+ (N == NULL)||
+ (g == NULL)||
+ (bn_ctx == NULL))
+ goto err;
+
+ srp_bn_print(N);
+ srp_bn_print(g);
+
+ if (*salt == NULL)
+ {
+ RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+ *salt = BN_bin2bn(tmp2,SRP_RANDOM_SALT_LEN,NULL);
+ }
+
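+	/* v = g^x (mod N), where x is derived from the salt, the username
+	 * and the password */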
+ x = SRP_Calc_x(*salt,user,pass);
+
+ *verifier = BN_new();
+ if(*verifier == NULL) goto err;
+
+ if (!BN_mod_exp(*verifier,g,x,N,bn_ctx))
+ {
+ BN_clear_free(*verifier);
+ goto err;
+ }
+
+ srp_bn_print(*verifier);
+
+ result=1;
+
+err:
+
+ BN_clear_free(x);
+ BN_CTX_free(bn_ctx);
+ return result;
+ }
+
+
+
+#endif
diff --git a/main/openssl/crypto/stack/safestack.h b/main/openssl/crypto/stack/safestack.h
index 3e76aa58..ea3aa0d8 100644
--- a/main/openssl/crypto/stack/safestack.h
+++ b/main/openssl/crypto/stack/safestack.h
@@ -1459,6 +1459,94 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
#define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
+#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
+#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
+#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
+#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st))
+#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i))
+#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val))
+#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st))
+#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val))
+#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val))
+#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val))
+#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val))
+#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i))
+#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr))
+#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i))
+#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
+#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
+#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
+#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
+#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
+#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st))
+
+#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp))
+#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache)
+#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val))
+#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr))
+#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i))
+#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
+#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
+#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st))
+
+#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp))
+#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd)
+#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val))
+#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr))
+#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i))
+#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
+#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
+#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st))
+
+#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE)
+#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val))
+#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr))
+#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
+#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st))
+
#define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp))
#define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER)
#define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st))
@@ -2056,31 +2144,6 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st))
-#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
-#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
-#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
-#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
-#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
-#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
-#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \
- ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
- sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
-#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
-
-
#define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp)))
#define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null())
#define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
@@ -2106,6 +2169,31 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st))
+#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
+#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
+#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
+#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
+#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
+#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \
+ ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
+ sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
+#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
+
+
#define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \
SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class))
#define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \
diff --git a/main/openssl/crypto/symhacks.h b/main/openssl/crypto/symhacks.h
index 3fd4a816..07a412f8 100644
--- a/main/openssl/crypto/symhacks.h
+++ b/main/openssl/crypto/symhacks.h
@@ -176,7 +176,6 @@
#define SSL_CTX_set_default_passwd_cb_userdata SSL_CTX_set_def_passwd_cb_ud
#undef SSL_COMP_get_compression_methods
#define SSL_COMP_get_compression_methods SSL_COMP_get_compress_methods
-
#undef ssl_add_clienthello_renegotiate_ext
#define ssl_add_clienthello_renegotiate_ext ssl_add_clienthello_reneg_ext
#undef ssl_add_serverhello_renegotiate_ext
@@ -185,6 +184,26 @@
#define ssl_parse_clienthello_renegotiate_ext ssl_parse_clienthello_reneg_ext
#undef ssl_parse_serverhello_renegotiate_ext
#define ssl_parse_serverhello_renegotiate_ext ssl_parse_serverhello_reneg_ext
+#undef SSL_srp_server_param_with_username
+#define SSL_srp_server_param_with_username SSL_srp_server_param_with_un
+#undef SSL_CTX_set_srp_client_pwd_callback
+#define SSL_CTX_set_srp_client_pwd_callback SSL_CTX_set_srp_client_pwd_cb
+#undef SSL_CTX_set_srp_verify_param_callback
+#define SSL_CTX_set_srp_verify_param_callback SSL_CTX_set_srp_vfy_param_cb
+#undef SSL_CTX_set_srp_username_callback
+#define SSL_CTX_set_srp_username_callback SSL_CTX_set_srp_un_cb
+#undef ssl_add_clienthello_use_srtp_ext
+#define ssl_add_clienthello_use_srtp_ext ssl_add_clihello_use_srtp_ext
+#undef ssl_add_serverhello_use_srtp_ext
+#define ssl_add_serverhello_use_srtp_ext ssl_add_serhello_use_srtp_ext
+#undef ssl_parse_clienthello_use_srtp_ext
+#define ssl_parse_clienthello_use_srtp_ext ssl_parse_clihello_use_srtp_ext
+#undef ssl_parse_serverhello_use_srtp_ext
+#define ssl_parse_serverhello_use_srtp_ext ssl_parse_serhello_use_srtp_ext
+#undef SSL_CTX_set_next_protos_advertised_cb
+#define SSL_CTX_set_next_protos_advertised_cb SSL_CTX_set_next_protos_adv_cb
+#undef SSL_CTX_set_next_proto_select_cb
+#define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb
/* Hack some long ENGINE names */
#undef ENGINE_get_default_BN_mod_exp_crt
@@ -238,6 +257,9 @@
#define EC_GROUP_get_point_conversion_form EC_GROUP_get_point_conv_form
#undef EC_GROUP_clear_free_all_extra_data
#define EC_GROUP_clear_free_all_extra_data EC_GROUP_clr_free_all_xtra_data
+#undef EC_KEY_set_public_key_affine_coordinates
+#define EC_KEY_set_public_key_affine_coordinates \
+ EC_KEY_set_pub_key_aff_coords
#undef EC_POINT_set_Jprojective_coordinates_GFp
#define EC_POINT_set_Jprojective_coordinates_GFp \
EC_POINT_set_Jproj_coords_GFp
@@ -294,8 +316,6 @@
#define ec_GFp_simple_point_set_to_infinity ec_GFp_simple_pt_set_to_inf
#undef ec_GFp_simple_points_make_affine
#define ec_GFp_simple_points_make_affine ec_GFp_simple_pts_make_affine
-#undef ec_GFp_simple_group_get_curve_GFp
-#define ec_GFp_simple_group_get_curve_GFp ec_GFp_simple_grp_get_curve_GFp
#undef ec_GFp_simple_set_Jprojective_coordinates_GFp
#define ec_GFp_simple_set_Jprojective_coordinates_GFp \
ec_GFp_smp_set_Jproj_coords_GFp
@@ -399,6 +419,12 @@
#undef dtls1_retransmit_buffered_messages
#define dtls1_retransmit_buffered_messages dtls1_retransmit_buffered_msgs
+/* Hack some long SRP names */
+#undef SRP_generate_server_master_secret
+#define SRP_generate_server_master_secret SRP_gen_server_master_secret
+#undef SRP_generate_client_master_secret
+#define SRP_generate_client_master_secret SRP_gen_client_master_secret
+
/* Hack some long UI names */
#undef UI_method_get_prompt_constructor
#define UI_method_get_prompt_constructor UI_method_get_prompt_constructr
diff --git a/main/openssl/crypto/ui/ui.h b/main/openssl/crypto/ui/ui.h
index 2b1cfa22..bd78aa41 100644
--- a/main/openssl/crypto/ui/ui.h
+++ b/main/openssl/crypto/ui/ui.h
@@ -316,7 +316,7 @@ int (*UI_method_get_writer(UI_METHOD *method))(UI*,UI_STRING*);
int (*UI_method_get_flusher(UI_METHOD *method))(UI*);
int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*);
int (*UI_method_get_closer(UI_METHOD *method))(UI*);
-char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
+char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
/* The following functions are helpers for method writers to access relevant
data from a UI_STRING. */
diff --git a/main/openssl/crypto/ui/ui_openssl.c b/main/openssl/crypto/ui/ui_openssl.c
index b05cbf34..e6ccd34a 100644
--- a/main/openssl/crypto/ui/ui_openssl.c
+++ b/main/openssl/crypto/ui/ui_openssl.c
@@ -122,9 +122,15 @@
* sigaction and fileno included. -pedantic would be more appropriate for
* the intended purposes, but we can't prevent users from adding -ansi.
*/
+#if defined(OPENSSL_SYSNAME_VXWORKS)
+#include <sys/types.h>
+#endif
+
+#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS)
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 2
#endif
+#endif
#include <signal.h>
#include <stdio.h>
#include <string.h>
diff --git a/main/openssl/crypto/x509/x509.h b/main/openssl/crypto/x509/x509.h
index e6f8a403..092dd745 100644
--- a/main/openssl/crypto/x509/x509.h
+++ b/main/openssl/crypto/x509/x509.h
@@ -657,11 +657,15 @@ int NETSCAPE_SPKI_set_pubkey(NETSCAPE_SPKI *x, EVP_PKEY *pkey);
int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki);
+int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent);
int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig);
int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_pubkey_digest(const X509 *data,const EVP_MD *type,
@@ -763,6 +767,7 @@ X509_ALGOR *X509_ALGOR_dup(X509_ALGOR *xn);
int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval);
void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
X509_ALGOR *algor);
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md);
X509_NAME *X509_NAME_dup(X509_NAME *xn);
X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne);
@@ -896,6 +901,9 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *algor1,
int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
ASN1_BIT_STRING *signature,
void *data, EVP_PKEY *pkey, const EVP_MD *type);
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx);
#endif
int X509_set_version(X509 *x,long version);
@@ -1161,6 +1169,9 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *salt, int saltlen,
unsigned char *aiv, int prf_nid);
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen);
+
/* PKCS#8 utilities */
DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)
diff --git a/main/openssl/crypto/x509/x509_cmp.c b/main/openssl/crypto/x509/x509_cmp.c
index 4bc9da07..352aa374 100644
--- a/main/openssl/crypto/x509/x509_cmp.c
+++ b/main/openssl/crypto/x509/x509_cmp.c
@@ -86,16 +86,20 @@ unsigned long X509_issuer_and_serial_hash(X509 *a)
EVP_MD_CTX_init(&ctx);
f=X509_NAME_oneline(a->cert_info->issuer,NULL,0);
- ret=strlen(f);
- EVP_DigestInit_ex(&ctx, EVP_md5(), NULL);
- EVP_DigestUpdate(&ctx,(unsigned char *)f,ret);
+ if (!EVP_DigestInit_ex(&ctx, EVP_md5(), NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx,(unsigned char *)f,strlen(f)))
+ goto err;
OPENSSL_free(f);
- EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
- (unsigned long)a->cert_info->serialNumber->length);
- EVP_DigestFinal_ex(&ctx,&(md[0]),NULL);
+ if(!EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
+ (unsigned long)a->cert_info->serialNumber->length))
+ goto err;
+ if (!EVP_DigestFinal_ex(&ctx,&(md[0]),NULL))
+ goto err;
ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
)&0xffffffffL;
+ err:
EVP_MD_CTX_cleanup(&ctx);
return(ret);
}
@@ -219,7 +223,9 @@ unsigned long X509_NAME_hash(X509_NAME *x)
/* Make sure X509_NAME structure contains valid cached encoding */
i2d_X509_NAME(x,NULL);
- EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(),
+ NULL))
+ return 0;
ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
@@ -234,16 +240,22 @@ unsigned long X509_NAME_hash(X509_NAME *x)
unsigned long X509_NAME_hash_old(X509_NAME *x)
{
+ EVP_MD_CTX md_ctx;
unsigned long ret=0;
unsigned char md[16];
/* Make sure X509_NAME structure contains valid cached encoding */
i2d_X509_NAME(x,NULL);
- EVP_Digest(x->bytes->data, x->bytes->length, md, NULL, EVP_md5(), NULL);
+ EVP_MD_CTX_init(&md_ctx);
+ EVP_MD_CTX_set_flags(&md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ if (EVP_DigestInit_ex(&md_ctx, EVP_md5(), NULL)
+ && EVP_DigestUpdate(&md_ctx, x->bytes->data, x->bytes->length)
+ && EVP_DigestFinal_ex(&md_ctx,md,NULL))
+ ret=(((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
+ ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
+ )&0xffffffffL;
+ EVP_MD_CTX_cleanup(&md_ctx);
- ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
- ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
- )&0xffffffffL;
return(ret);
}
#endif
diff --git a/main/openssl/crypto/x509/x509_lu.c b/main/openssl/crypto/x509/x509_lu.c
index 3a6e04a1..38525a8c 100644
--- a/main/openssl/crypto/x509/x509_lu.c
+++ b/main/openssl/crypto/x509/x509_lu.c
@@ -87,7 +87,7 @@ void X509_LOOKUP_free(X509_LOOKUP *ctx)
if (ctx == NULL) return;
if ( (ctx->method != NULL) &&
(ctx->method->free != NULL))
- ctx->method->free(ctx);
+ (*ctx->method->free)(ctx);
OPENSSL_free(ctx);
}
diff --git a/main/openssl/crypto/x509/x509_vfy.c b/main/openssl/crypto/x509/x509_vfy.c
index 5a0b0249..5195ffef 100644
--- a/main/openssl/crypto/x509/x509_vfy.c
+++ b/main/openssl/crypto/x509/x509_vfy.c
@@ -153,7 +153,6 @@ static int x509_subject_cmp(X509 **a, X509 **b)
int X509_verify_cert(X509_STORE_CTX *ctx)
{
X509 *x,*xtmp,*chain_ss=NULL;
- X509_NAME *xn;
int bad_chain = 0;
X509_VERIFY_PARAM *param = ctx->param;
int depth,i,ok=0;
@@ -205,7 +204,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
*/
/* If we are self signed, we break */
- xn=X509_get_issuer_name(x);
if (ctx->check_issued(ctx, x,x)) break;
/* If we were passed a cert chain, use it first */
@@ -242,7 +240,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
i=sk_X509_num(ctx->chain);
x=sk_X509_value(ctx->chain,i-1);
- xn = X509_get_subject_name(x);
if (ctx->check_issued(ctx, x, x))
{
/* we have a self signed certificate */
@@ -291,7 +288,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (depth < num) break;
/* If we are self signed, we break */
- xn=X509_get_issuer_name(x);
if (ctx->check_issued(ctx,x,x)) break;
ok = ctx->get_issuer(&xtmp, ctx, x);
@@ -310,7 +306,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
}
/* we now have our chain, lets check it... */
- xn=X509_get_issuer_name(x);
/* Is last certificate looked up self signed? */
if (!ctx->check_issued(ctx,x,x))
@@ -699,6 +694,7 @@ static int check_cert(X509_STORE_CTX *ctx)
X509_CRL *crl = NULL, *dcrl = NULL;
X509 *x;
int ok, cnum;
+ unsigned int last_reasons;
cnum = ctx->error_depth;
x = sk_X509_value(ctx->chain, cnum);
ctx->current_cert = x;
@@ -707,6 +703,7 @@ static int check_cert(X509_STORE_CTX *ctx)
ctx->current_reasons = 0;
while (ctx->current_reasons != CRLDP_ALL_REASONS)
{
+ last_reasons = ctx->current_reasons;
/* Try to retrieve relevant CRL */
if (ctx->get_crl)
ok = ctx->get_crl(ctx, &crl, x);
@@ -750,6 +747,15 @@ static int check_cert(X509_STORE_CTX *ctx)
X509_CRL_free(dcrl);
crl = NULL;
dcrl = NULL;
+			/* If the reasons were not updated, another iteration
+			 * won't get us anywhere, so exit the loop.
+			 */
+ if (last_reasons == ctx->current_reasons)
+ {
+ ctx->error = X509_V_ERR_UNABLE_TO_GET_CRL;
+ ok = ctx->verify_cb(0, ctx);
+ goto err;
+ }
}
err:
X509_CRL_free(crl);
@@ -877,7 +883,7 @@ static int crl_extension_match(X509_CRL *a, X509_CRL *b, int nid)
{
ASN1_OCTET_STRING *exta, *extb;
int i;
- i = X509_CRL_get_ext_by_NID(a, nid, 0);
+ i = X509_CRL_get_ext_by_NID(a, nid, -1);
if (i >= 0)
{
/* Can't have multiple occurrences */
@@ -888,7 +894,7 @@ static int crl_extension_match(X509_CRL *a, X509_CRL *b, int nid)
else
exta = NULL;
- i = X509_CRL_get_ext_by_NID(b, nid, 0);
+ i = X509_CRL_get_ext_by_NID(b, nid, -1);
if (i >= 0)
{
@@ -1732,7 +1738,7 @@ int X509_cmp_time(const ASN1_TIME *ctm, time_t *cmp_time)
atm.length=sizeof(buff2);
atm.data=(unsigned char *)buff2;
- if (X509_time_adj(&atm,-offset*60, cmp_time) == NULL)
+ if (X509_time_adj(&atm, offset*60, cmp_time) == NULL)
return 0;
if (ctm->type == V_ASN1_UTCTIME)
diff --git a/main/openssl/crypto/x509/x509type.c b/main/openssl/crypto/x509/x509type.c
index 3385ad3f..9702ec53 100644
--- a/main/openssl/crypto/x509/x509type.c
+++ b/main/openssl/crypto/x509/x509type.c
@@ -100,20 +100,26 @@ int X509_certificate_type(X509 *x, EVP_PKEY *pkey)
break;
}
- i=X509_get_signature_type(x);
- switch (i)
+ i=OBJ_obj2nid(x->sig_alg->algorithm);
+ if (i && OBJ_find_sigid_algs(i, NULL, &i))
{
- case EVP_PKEY_RSA:
- ret|=EVP_PKS_RSA;
- break;
- case EVP_PKEY_DSA:
- ret|=EVP_PKS_DSA;
- break;
- case EVP_PKEY_EC:
- ret|=EVP_PKS_EC;
- break;
- default:
- break;
+
+ switch (i)
+ {
+ case NID_rsaEncryption:
+ case NID_rsa:
+ ret|=EVP_PKS_RSA;
+ break;
+ case NID_dsa:
+ case NID_dsa_2:
+ ret|=EVP_PKS_DSA;
+ break;
+ case NID_X9_62_id_ecPublicKey:
+ ret|=EVP_PKS_EC;
+ break;
+ default:
+ break;
+ }
}
if (EVP_PKEY_size(pk) <= 1024/8)/* /8 because it's 1024 bits we look
diff --git a/main/openssl/crypto/x509/x_all.c b/main/openssl/crypto/x509/x_all.c
index 8ec88c21..e06602d6 100644
--- a/main/openssl/crypto/x509/x_all.c
+++ b/main/openssl/crypto/x509/x_all.c
@@ -95,12 +95,26 @@ int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md)
x->sig_alg, x->signature, x->cert_info,pkey,md));
}
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx)
+ {
+ x->cert_info->enc.modified = 1;
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CINF),
+ x->cert_info->signature,
+ x->sig_alg, x->signature, x->cert_info, ctx);
+ }
+
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return(ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO),x->sig_alg, NULL,
x->signature, x->req_info,pkey,md));
}
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx)
+ {
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_REQ_INFO),
+ x->sig_alg, NULL, x->signature, x->req_info, ctx);
+ }
+
int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
{
x->crl->enc.modified = 1;
@@ -108,6 +122,13 @@ int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
x->sig_alg, x->signature, x->crl,pkey,md));
}
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx)
+ {
+ x->crl->enc.modified = 1;
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CRL_INFO),
+ x->crl->sig_alg, x->sig_alg, x->signature, x->crl, ctx);
+ }
+
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return(ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor,NULL,
diff --git a/main/openssl/crypto/x509v3/v3_addr.c b/main/openssl/crypto/x509v3/v3_addr.c
index 0d70e869..df46a498 100644
--- a/main/openssl/crypto/x509v3/v3_addr.c
+++ b/main/openssl/crypto/x509v3/v3_addr.c
@@ -142,12 +142,13 @@ unsigned int v3_addr_get_afi(const IPAddressFamily *f)
* Expand the bitstring form of an address into a raw byte array.
* At the moment this is coded for simplicity, not speed.
*/
-static void addr_expand(unsigned char *addr,
+static int addr_expand(unsigned char *addr,
const ASN1_BIT_STRING *bs,
const int length,
const unsigned char fill)
{
- OPENSSL_assert(bs->length >= 0 && bs->length <= length);
+ if (bs->length < 0 || bs->length > length)
+ return 0;
if (bs->length > 0) {
memcpy(addr, bs->data, bs->length);
if ((bs->flags & 7) != 0) {
@@ -159,6 +160,7 @@ static void addr_expand(unsigned char *addr,
}
}
memset(addr + bs->length, fill, length - bs->length);
+ return 1;
}
/*
@@ -181,15 +183,13 @@ static int i2r_address(BIO *out,
return 0;
switch (afi) {
case IANA_AFI_IPV4:
- if (bs->length > 4)
+ if (!addr_expand(addr, bs, 4, fill))
return 0;
- addr_expand(addr, bs, 4, fill);
BIO_printf(out, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]);
break;
case IANA_AFI_IPV6:
- if (bs->length > 16)
+ if (!addr_expand(addr, bs, 16, fill))
return 0;
- addr_expand(addr, bs, 16, fill);
for (n = 16; n > 1 && addr[n-1] == 0x00 && addr[n-2] == 0x00; n -= 2)
;
for (i = 0; i < n; i += 2)
@@ -315,6 +315,12 @@ static int i2r_IPAddrBlocks(const X509V3_EXT_METHOD *method,
/*
* Sort comparison function for a sequence of IPAddressOrRange
* elements.
+ *
+ * There's no sane answer we can give if addr_expand() fails, and an
+ * assertion failure on externally supplied data is seriously uncool,
+ * so we just arbitrarily declare that if given invalid inputs this
+ * function returns -1. If this messes up your preferred sort order
+ * for garbage input, tough noogies.
*/
static int IPAddressOrRange_cmp(const IPAddressOrRange *a,
const IPAddressOrRange *b,
@@ -326,22 +332,26 @@ static int IPAddressOrRange_cmp(const IPAddressOrRange *a,
switch (a->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(addr_a, a->u.addressPrefix, length, 0x00);
+ if (!addr_expand(addr_a, a->u.addressPrefix, length, 0x00))
+ return -1;
prefixlen_a = addr_prefixlen(a->u.addressPrefix);
break;
case IPAddressOrRange_addressRange:
- addr_expand(addr_a, a->u.addressRange->min, length, 0x00);
+ if (!addr_expand(addr_a, a->u.addressRange->min, length, 0x00))
+ return -1;
prefixlen_a = length * 8;
break;
}
switch (b->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(addr_b, b->u.addressPrefix, length, 0x00);
+ if (!addr_expand(addr_b, b->u.addressPrefix, length, 0x00))
+ return -1;
prefixlen_b = addr_prefixlen(b->u.addressPrefix);
break;
case IPAddressOrRange_addressRange:
- addr_expand(addr_b, b->u.addressRange->min, length, 0x00);
+ if (!addr_expand(addr_b, b->u.addressRange->min, length, 0x00))
+ return -1;
prefixlen_b = length * 8;
break;
}
@@ -383,6 +393,7 @@ static int range_should_be_prefix(const unsigned char *min,
unsigned char mask;
int i, j;
+ OPENSSL_assert(memcmp(min, max, length) <= 0);
for (i = 0; i < length && min[i] == max[i]; i++)
;
for (j = length - 1; j >= 0 && min[j] == 0x00 && max[j] == 0xFF; j--)
@@ -601,10 +612,10 @@ static IPAddressOrRanges *make_prefix_or_range(IPAddrBlocks *addr,
return NULL;
switch (afi) {
case IANA_AFI_IPV4:
- sk_IPAddressOrRange_set_cmp_func(aors, v4IPAddressOrRange_cmp);
+ (void) sk_IPAddressOrRange_set_cmp_func(aors, v4IPAddressOrRange_cmp);
break;
case IANA_AFI_IPV6:
- sk_IPAddressOrRange_set_cmp_func(aors, v6IPAddressOrRange_cmp);
+ (void) sk_IPAddressOrRange_set_cmp_func(aors, v6IPAddressOrRange_cmp);
break;
}
f->ipAddressChoice->type = IPAddressChoice_addressesOrRanges;
@@ -656,22 +667,22 @@ int v3_addr_add_range(IPAddrBlocks *addr,
/*
* Extract min and max values from an IPAddressOrRange.
*/
-static void extract_min_max(IPAddressOrRange *aor,
+static int extract_min_max(IPAddressOrRange *aor,
unsigned char *min,
unsigned char *max,
int length)
{
- OPENSSL_assert(aor != NULL && min != NULL && max != NULL);
+ if (aor == NULL || min == NULL || max == NULL)
+ return 0;
switch (aor->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(min, aor->u.addressPrefix, length, 0x00);
- addr_expand(max, aor->u.addressPrefix, length, 0xFF);
- return;
+ return (addr_expand(min, aor->u.addressPrefix, length, 0x00) &&
+ addr_expand(max, aor->u.addressPrefix, length, 0xFF));
case IPAddressOrRange_addressRange:
- addr_expand(min, aor->u.addressRange->min, length, 0x00);
- addr_expand(max, aor->u.addressRange->max, length, 0xFF);
- return;
+ return (addr_expand(min, aor->u.addressRange->min, length, 0x00) &&
+ addr_expand(max, aor->u.addressRange->max, length, 0xFF));
}
+ return 0;
}
/*
@@ -687,9 +698,10 @@ int v3_addr_get_range(IPAddressOrRange *aor,
if (aor == NULL || min == NULL || max == NULL ||
afi_length == 0 || length < afi_length ||
(aor->type != IPAddressOrRange_addressPrefix &&
- aor->type != IPAddressOrRange_addressRange))
+ aor->type != IPAddressOrRange_addressRange) ||
+ !extract_min_max(aor, min, max, afi_length))
return 0;
- extract_min_max(aor, min, max, afi_length);
+
return afi_length;
}
@@ -771,8 +783,9 @@ int v3_addr_is_canonical(IPAddrBlocks *addr)
IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
IPAddressOrRange *b = sk_IPAddressOrRange_value(aors, j + 1);
- extract_min_max(a, a_min, a_max, length);
- extract_min_max(b, b_min, b_max, length);
+ if (!extract_min_max(a, a_min, a_max, length) ||
+ !extract_min_max(b, b_min, b_max, length))
+ return 0;
/*
* Punt misordered list, overlapping start, or inverted range.
@@ -800,14 +813,17 @@ int v3_addr_is_canonical(IPAddrBlocks *addr)
}
/*
- * Check final range to see if it should be a prefix.
+ * Check range to see if it's inverted or should be a
+ * prefix.
*/
j = sk_IPAddressOrRange_num(aors) - 1;
{
IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
- if (a->type == IPAddressOrRange_addressRange) {
- extract_min_max(a, a_min, a_max, length);
- if (range_should_be_prefix(a_min, a_max, length) >= 0)
+ if (a != NULL && a->type == IPAddressOrRange_addressRange) {
+ if (!extract_min_max(a, a_min, a_max, length))
+ return 0;
+ if (memcmp(a_min, a_max, length) > 0 ||
+ range_should_be_prefix(a_min, a_max, length) >= 0)
return 0;
}
}
@@ -841,8 +857,16 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
unsigned char a_min[ADDR_RAW_BUF_LEN], a_max[ADDR_RAW_BUF_LEN];
unsigned char b_min[ADDR_RAW_BUF_LEN], b_max[ADDR_RAW_BUF_LEN];
- extract_min_max(a, a_min, a_max, length);
- extract_min_max(b, b_min, b_max, length);
+ if (!extract_min_max(a, a_min, a_max, length) ||
+ !extract_min_max(b, b_min, b_max, length))
+ return 0;
+
+ /*
+ * Punt inverted ranges.
+ */
+ if (memcmp(a_min, a_max, length) > 0 ||
+ memcmp(b_min, b_max, length) > 0)
+ return 0;
/*
* Punt overlaps.
@@ -860,8 +884,8 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
IPAddressOrRange *merged;
if (!make_addressRange(&merged, a_min, b_max, length))
return 0;
- sk_IPAddressOrRange_set(aors, i, merged);
- sk_IPAddressOrRange_delete(aors, i + 1);
+ (void) sk_IPAddressOrRange_set(aors, i, merged);
+ (void) sk_IPAddressOrRange_delete(aors, i + 1);
IPAddressOrRange_free(a);
IPAddressOrRange_free(b);
--i;
@@ -869,6 +893,20 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
}
}
+ /*
+ * Check for inverted final range.
+ */
+ j = sk_IPAddressOrRange_num(aors) - 1;
+ {
+ IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
+ if (a != NULL && a->type == IPAddressOrRange_addressRange) {
+ unsigned char a_min[ADDR_RAW_BUF_LEN], a_max[ADDR_RAW_BUF_LEN];
+ extract_min_max(a, a_min, a_max, length);
+ if (memcmp(a_min, a_max, length) > 0)
+ return 0;
+ }
+ }
+
return 1;
}
@@ -885,7 +923,7 @@ int v3_addr_canonize(IPAddrBlocks *addr)
v3_addr_get_afi(f)))
return 0;
}
- sk_IPAddressFamily_set_cmp_func(addr, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(addr, IPAddressFamily_cmp);
sk_IPAddressFamily_sort(addr);
OPENSSL_assert(v3_addr_is_canonical(addr));
return 1;
@@ -1017,6 +1055,11 @@ static void *v2i_IPAddrBlocks(const struct v3_ext_method *method,
X509V3_conf_err(val);
goto err;
}
+ if (memcmp(min, max, length_from_afi(afi)) > 0) {
+ X509V3err(X509V3_F_V2I_IPADDRBLOCKS, X509V3_R_EXTENSION_VALUE_ERROR);
+ X509V3_conf_err(val);
+ goto err;
+ }
if (!v3_addr_add_range(addr, afi, safi, min, max)) {
X509V3err(X509V3_F_V2I_IPADDRBLOCKS, ERR_R_MALLOC_FAILURE);
goto err;
@@ -1102,13 +1145,15 @@ static int addr_contains(IPAddressOrRanges *parent,
p = 0;
for (c = 0; c < sk_IPAddressOrRange_num(child); c++) {
- extract_min_max(sk_IPAddressOrRange_value(child, c),
- c_min, c_max, length);
+ if (!extract_min_max(sk_IPAddressOrRange_value(child, c),
+ c_min, c_max, length))
+ return -1;
for (;; p++) {
if (p >= sk_IPAddressOrRange_num(parent))
return 0;
- extract_min_max(sk_IPAddressOrRange_value(parent, p),
- p_min, p_max, length);
+ if (!extract_min_max(sk_IPAddressOrRange_value(parent, p),
+ p_min, p_max, length))
+ return 0;
if (memcmp(p_max, c_max, length) < 0)
continue;
if (memcmp(p_min, c_min, length) > 0)
@@ -1130,7 +1175,7 @@ int v3_addr_subset(IPAddrBlocks *a, IPAddrBlocks *b)
return 1;
if (b == NULL || v3_addr_inherits(a) || v3_addr_inherits(b))
return 0;
- sk_IPAddressFamily_set_cmp_func(b, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(b, IPAddressFamily_cmp);
for (i = 0; i < sk_IPAddressFamily_num(a); i++) {
IPAddressFamily *fa = sk_IPAddressFamily_value(a, i);
int j = sk_IPAddressFamily_find(b, fa);
@@ -1195,7 +1240,7 @@ static int v3_addr_validate_path_internal(X509_STORE_CTX *ctx,
}
if (!v3_addr_is_canonical(ext))
validation_err(X509_V_ERR_INVALID_EXTENSION);
- sk_IPAddressFamily_set_cmp_func(ext, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(ext, IPAddressFamily_cmp);
if ((child = sk_IPAddressFamily_dup(ext)) == NULL) {
X509V3err(X509V3_F_V3_ADDR_VALIDATE_PATH_INTERNAL, ERR_R_MALLOC_FAILURE);
ret = 0;
@@ -1221,7 +1266,7 @@ static int v3_addr_validate_path_internal(X509_STORE_CTX *ctx,
}
continue;
}
- sk_IPAddressFamily_set_cmp_func(x->rfc3779_addr, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(x->rfc3779_addr, IPAddressFamily_cmp);
for (j = 0; j < sk_IPAddressFamily_num(child); j++) {
IPAddressFamily *fc = sk_IPAddressFamily_value(child, j);
int k = sk_IPAddressFamily_find(x->rfc3779_addr, fc);
diff --git a/main/openssl/crypto/x509v3/v3_asid.c b/main/openssl/crypto/x509v3/v3_asid.c
index 3f434c06..1587e8ed 100644
--- a/main/openssl/crypto/x509v3/v3_asid.c
+++ b/main/openssl/crypto/x509v3/v3_asid.c
@@ -358,6 +358,20 @@ static int ASIdentifierChoice_is_canonical(ASIdentifierChoice *choice)
goto done;
}
+ /*
+ * Check for inverted range.
+ */
+ i = sk_ASIdOrRange_num(choice->u.asIdsOrRanges) - 1;
+ {
+ ASIdOrRange *a = sk_ASIdOrRange_value(choice->u.asIdsOrRanges, i);
+ ASN1_INTEGER *a_min, *a_max;
+ if (a != NULL && a->type == ASIdOrRange_range) {
+ extract_min_max(a, &a_min, &a_max);
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0)
+ goto done;
+ }
+ }
+
ret = 1;
done:
@@ -392,9 +406,18 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
return 1;
/*
- * We have a list. Sort it.
+ * If not a list, or if empty list, it's broken.
+ */
+ if (choice->type != ASIdentifierChoice_asIdsOrRanges ||
+ sk_ASIdOrRange_num(choice->u.asIdsOrRanges) == 0) {
+ X509V3err(X509V3_F_ASIDENTIFIERCHOICE_CANONIZE,
+ X509V3_R_EXTENSION_VALUE_ERROR);
+ return 0;
+ }
+
+ /*
+ * We have a non-empty list. Sort it.
*/
- OPENSSL_assert(choice->type == ASIdentifierChoice_asIdsOrRanges);
sk_ASIdOrRange_sort(choice->u.asIdsOrRanges);
/*
@@ -415,6 +438,13 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
OPENSSL_assert(ASN1_INTEGER_cmp(a_min, b_min) <= 0);
/*
+ * Punt inverted ranges.
+ */
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0 ||
+ ASN1_INTEGER_cmp(b_min, b_max) > 0)
+ goto done;
+
+ /*
* Check for overlaps.
*/
if (ASN1_INTEGER_cmp(a_max, b_min) >= 0) {
@@ -465,12 +495,26 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
break;
}
ASIdOrRange_free(b);
- sk_ASIdOrRange_delete(choice->u.asIdsOrRanges, i + 1);
+ (void) sk_ASIdOrRange_delete(choice->u.asIdsOrRanges, i + 1);
i--;
continue;
}
}
+ /*
+ * Check for final inverted range.
+ */
+ i = sk_ASIdOrRange_num(choice->u.asIdsOrRanges) - 1;
+ {
+ ASIdOrRange *a = sk_ASIdOrRange_value(choice->u.asIdsOrRanges, i);
+ ASN1_INTEGER *a_min, *a_max;
+ if (a != NULL && a->type == ASIdOrRange_range) {
+ extract_min_max(a, &a_min, &a_max);
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0)
+ goto done;
+ }
+ }
+
OPENSSL_assert(ASIdentifierChoice_is_canonical(choice)); /* Paranoia */
ret = 1;
@@ -498,6 +542,7 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
struct v3_ext_ctx *ctx,
STACK_OF(CONF_VALUE) *values)
{
+ ASN1_INTEGER *min = NULL, *max = NULL;
ASIdentifiers *asid = NULL;
int i;
@@ -508,7 +553,6 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
for (i = 0; i < sk_CONF_VALUE_num(values); i++) {
CONF_VALUE *val = sk_CONF_VALUE_value(values, i);
- ASN1_INTEGER *min = NULL, *max = NULL;
int i1, i2, i3, is_range, which;
/*
@@ -578,18 +622,19 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
max = s2i_ASN1_INTEGER(NULL, s + i2);
OPENSSL_free(s);
if (min == NULL || max == NULL) {
- ASN1_INTEGER_free(min);
- ASN1_INTEGER_free(max);
X509V3err(X509V3_F_V2I_ASIDENTIFIERS, ERR_R_MALLOC_FAILURE);
goto err;
}
+ if (ASN1_INTEGER_cmp(min, max) > 0) {
+ X509V3err(X509V3_F_V2I_ASIDENTIFIERS, X509V3_R_EXTENSION_VALUE_ERROR);
+ goto err;
+ }
}
if (!v3_asid_add_id_or_range(asid, which, min, max)) {
- ASN1_INTEGER_free(min);
- ASN1_INTEGER_free(max);
X509V3err(X509V3_F_V2I_ASIDENTIFIERS, ERR_R_MALLOC_FAILURE);
goto err;
}
+ min = max = NULL;
}
/*
@@ -601,6 +646,8 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
err:
ASIdentifiers_free(asid);
+ ASN1_INTEGER_free(min);
+ ASN1_INTEGER_free(max);
return NULL;
}
diff --git a/main/openssl/crypto/x509v3/v3_pci.c b/main/openssl/crypto/x509v3/v3_pci.c
index 0dcfa004..f7b733ae 100644
--- a/main/openssl/crypto/x509v3/v3_pci.c
+++ b/main/openssl/crypto/x509v3/v3_pci.c
@@ -2,7 +2,7 @@
/* Contributed to the OpenSSL Project 2004
* by Richard Levitte (richard@levitte.org)
*/
-/* Copyright (c) 2004 Kungliga Tekniska Högskolan
+/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
diff --git a/main/openssl/crypto/x509v3/v3_pcia.c b/main/openssl/crypto/x509v3/v3_pcia.c
index bb362e0e..eb082739 100644
--- a/main/openssl/crypto/x509v3/v3_pcia.c
+++ b/main/openssl/crypto/x509v3/v3_pcia.c
@@ -2,7 +2,7 @@
/* Contributed to the OpenSSL Project 2004
* by Richard Levitte (richard@levitte.org)
*/
-/* Copyright (c) 2004 Kungliga Tekniska Högskolan
+/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
diff --git a/main/openssl/crypto/x509v3/v3_purp.c b/main/openssl/crypto/x509v3/v3_purp.c
index 181bd349..ad688657 100644
--- a/main/openssl/crypto/x509v3/v3_purp.c
+++ b/main/openssl/crypto/x509v3/v3_purp.c
@@ -474,11 +474,11 @@ static void x509v3_cache_extensions(X509 *x)
for (i = 0; i < X509_get_ext_count(x); i++)
{
ex = X509_get_ext(x, i);
- if (!X509_EXTENSION_get_critical(ex))
- continue;
if (OBJ_obj2nid(X509_EXTENSION_get_object(ex))
== NID_freshest_crl)
x->ex_flags |= EXFLAG_FRESHEST;
+ if (!X509_EXTENSION_get_critical(ex))
+ continue;
if (!X509_supported_extension(ex))
{
x->ex_flags |= EXFLAG_CRITICAL;
diff --git a/main/openssl/crypto/x509v3/v3_skey.c b/main/openssl/crypto/x509v3/v3_skey.c
index 202c9e48..0a984fba 100644
--- a/main/openssl/crypto/x509v3/v3_skey.c
+++ b/main/openssl/crypto/x509v3/v3_skey.c
@@ -129,7 +129,8 @@ static ASN1_OCTET_STRING *s2i_skey_id(X509V3_EXT_METHOD *method,
goto err;
}
- EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL);
+ if (!EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL))
+ goto err;
if(!M_ASN1_OCTET_STRING_set(oct, pkey_dig, diglen)) {
X509V3err(X509V3_F_S2I_SKEY_ID,ERR_R_MALLOC_FAILURE);
diff --git a/main/openssl/crypto/x509v3/v3_utl.c b/main/openssl/crypto/x509v3/v3_utl.c
index e0302345..87cfceb4 100644
--- a/main/openssl/crypto/x509v3/v3_utl.c
+++ b/main/openssl/crypto/x509v3/v3_utl.c
@@ -365,7 +365,7 @@ char *hex_to_string(const unsigned char *buffer, long len)
char *tmp, *q;
const unsigned char *p;
int i;
- const static char hexdig[] = "0123456789ABCDEF";
+ static const char hexdig[] = "0123456789ABCDEF";
if(!buffer || !len) return NULL;
if(!(tmp = OPENSSL_malloc(len * 3 + 1))) {
X509V3err(X509V3_F_HEX_TO_STRING,ERR_R_MALLOC_FAILURE);
diff --git a/main/openssl/crypto/x86_64cpuid.S b/main/openssl/crypto/x86_64cpuid.S
new file mode 100644
index 00000000..562b03ab
--- /dev/null
+++ b/main/openssl/crypto/x86_64cpuid.S
@@ -0,0 +1,234 @@
+
+.hidden OPENSSL_cpuid_setup
+.section .init
+ call OPENSSL_cpuid_setup
+
+.hidden OPENSSL_ia32cap_P
+.comm OPENSSL_ia32cap_P,8,4
+
+.text
+
+.globl OPENSSL_atomic_add
+.type OPENSSL_atomic_add,@function
+.align 16
+OPENSSL_atomic_add:
+ movl (%rdi),%eax
+.Lspin: leaq (%rsi,%rax,1),%r8
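+	# the 0xf0 byte below is a LOCK prefix for the following cmpxchgl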
+.byte 0xf0
+ cmpxchgl %r8d,(%rdi)
+ jne .Lspin
+ movl %r8d,%eax
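+	# 0x48,0x98 is cltq (sign-extend %eax into %rax); the 0xf3,0xc3
+	# sequences throughout this file encode "repz ret"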
+.byte 0x48,0x98
+ .byte 0xf3,0xc3
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl OPENSSL_rdtsc
+.type OPENSSL_rdtsc,@function
+.align 16
+OPENSSL_rdtsc:
+ rdtsc
+ shlq $32,%rdx
+ orq %rdx,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl OPENSSL_ia32_cpuid
+.type OPENSSL_ia32_cpuid,@function
+.align 16
+OPENSSL_ia32_cpuid:
+ movq %rbx,%r8
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,%r11d
+
+ xorl %eax,%eax
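+	# "GenuineIntel" vendor check: 0x756e6547 = "Genu",
+	# 0x49656e69 = "ineI", 0x6c65746e = "ntel"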
+ cmpl $1970169159,%ebx
+ setne %al
+ movl %eax,%r9d
+ cmpl $1231384169,%edx
+ setne %al
+ orl %eax,%r9d
+ cmpl $1818588270,%ecx
+ setne %al
+ orl %eax,%r9d
+ jz .Lintel
+
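+	# "AuthenticAMD" vendor check ("Auth"/"enti"/"cAMD")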
+ cmpl $1752462657,%ebx
+ setne %al
+ movl %eax,%r10d
+ cmpl $1769238117,%edx
+ setne %al
+ orl %eax,%r10d
+ cmpl $1145913699,%ecx
+ setne %al
+ orl %eax,%r10d
+ jnz .Lintel
+
+
+ movl $2147483648,%eax
+ cpuid
+ cmpl $2147483649,%eax
+ jb .Lintel
+ movl %eax,%r10d
+ movl $2147483649,%eax
+ cpuid
+ orl %ecx,%r9d
+ andl $2049,%r9d
+
+ cmpl $2147483656,%r10d
+ jb .Lintel
+
+ movl $2147483656,%eax
+ cpuid
+ movzbq %cl,%r10
+ incq %r10
+
+ movl $1,%eax
+ cpuid
+ btl $28,%edx
+ jnc .Lgeneric
+ shrl $16,%ebx
+ cmpb %r10b,%bl
+ ja .Lgeneric
+ andl $4026531839,%edx
+ jmp .Lgeneric
+
+.Lintel:
+ cmpl $4,%r11d
+ movl $-1,%r10d
+ jb .Lnocacheinfo
+
+ movl $4,%eax
+ movl $0,%ecx
+ cpuid
+ movl %eax,%r10d
+ shrl $14,%r10d
+ andl $4095,%r10d
+
+.Lnocacheinfo:
+ movl $1,%eax
+ cpuid
+ andl $3220176895,%edx
+ cmpl $0,%r9d
+ jne .Lnotintel
+ orl $1073741824,%edx
+ andb $15,%ah
+ cmpb $15,%ah
+ jne .Lnotintel
+ orl $1048576,%edx
+.Lnotintel:
+ btl $28,%edx
+ jnc .Lgeneric
+ andl $4026531839,%edx
+ cmpl $0,%r10d
+ je .Lgeneric
+
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja .Lgeneric
+ andl $4026531839,%edx
+.Lgeneric:
+ andl $2048,%r9d
+ andl $4294965247,%ecx
+ orl %ecx,%r9d
+
+ movl %edx,%r10d
+ btl $27,%r9d
+ jnc .Lclear_avx
+ xorl %ecx,%ecx
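+	# xgetbv: read XCR0 into %edx:%eax (byte-encoded for older assemblers)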
+.byte 0x0f,0x01,0xd0
+ andl $6,%eax
+ cmpl $6,%eax
+ je .Ldone
+.Lclear_avx:
+ movl $4026525695,%eax
+ andl %eax,%r9d
+.Ldone:
+ shlq $32,%r9
+ movl %r10d,%eax
+ movq %r8,%rbx
+ orq %r9,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl OPENSSL_cleanse
+.type OPENSSL_cleanse,@function
+.align 16
+OPENSSL_cleanse:
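+	# zero len (%rsi) bytes at ptr (%rdi): a byte loop handles short
+	# or unaligned data, 8-byte stores handle the aligned bulk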
+ xorq %rax,%rax
+ cmpq $15,%rsi
+ jae .Lot
+ cmpq $0,%rsi
+ je .Lret
+.Little:
+ movb %al,(%rdi)
+ subq $1,%rsi
+ leaq 1(%rdi),%rdi
+ jnz .Little
+.Lret:
+ .byte 0xf3,0xc3
+.align 16
+.Lot:
+ testq $7,%rdi
+ jz .Laligned
+ movb %al,(%rdi)
+ leaq -1(%rsi),%rsi
+ leaq 1(%rdi),%rdi
+ jmp .Lot
+.Laligned:
+ movq %rax,(%rdi)
+ leaq -8(%rsi),%rsi
+ testq $-8,%rsi
+ leaq 8(%rdi),%rdi
+ jnz .Laligned
+ cmpq $0,%rsi
+ jne .Little
+ .byte 0xf3,0xc3
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.globl OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,@function
+.align 16
+OPENSSL_wipe_cpu:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ xorq %rcx,%rcx
+ xorq %rdx,%rdx
+ xorq %rsi,%rsi
+ xorq %rdi,%rdi
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ leaq 8(%rsp),%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,@function
+.align 16
+OPENSSL_ia32_rdrand:
+ movl $8,%ecx
+.Loop_rdrand:
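+	# rdrand %rax, byte-encoded for assemblers without RDRAND support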
+.byte 72,15,199,240
+ jc .Lbreak_rdrand
+ loop .Loop_rdrand
+.Lbreak_rdrand:
+ cmpq $0,%rax
+ cmoveq %rcx,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
diff --git a/main/openssl/crypto/x86_64cpuid.pl b/main/openssl/crypto/x86_64cpuid.pl
index c96821a3..6ebfd017 100644
--- a/main/openssl/crypto/x86_64cpuid.pl
+++ b/main/openssl/crypto/x86_64cpuid.pl
@@ -7,15 +7,25 @@ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
-else { $arg1="%rdi"; $arg2="%rsi"; }
print<<___;
.extern OPENSSL_cpuid_setup
+.hidden OPENSSL_cpuid_setup
.section .init
call OPENSSL_cpuid_setup
+.hidden OPENSSL_ia32cap_P
+.comm OPENSSL_ia32cap_P,8,4
+
.text
.globl OPENSSL_atomic_add
@@ -46,7 +56,7 @@ OPENSSL_rdtsc:
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
- mov %rbx,%r8
+ mov %rbx,%r8 # save %rbx
xor %eax,%eax
cpuid
@@ -78,7 +88,15 @@ OPENSSL_ia32_cpuid:
# AMD specific
mov \$0x80000000,%eax
cpuid
- cmp \$0x80000008,%eax
+ cmp \$0x80000001,%eax
+ jb .Lintel
+ mov %eax,%r10d
+ mov \$0x80000001,%eax
+ cpuid
+ or %ecx,%r9d
+ and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
+
+ cmp \$0x80000008,%r10d
jb .Lintel
mov \$0x80000008,%eax
@@ -89,12 +107,12 @@ OPENSSL_ia32_cpuid:
mov \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
shr \$16,%ebx # number of logical processors
cmp %r10b,%bl
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
- jmp .Ldone
+ jmp .Lgeneric
.Lintel:
cmp \$4,%r11d
@@ -111,30 +129,47 @@ OPENSSL_ia32_cpuid:
.Lnocacheinfo:
mov \$1,%eax
cpuid
+ and \$0xbfefffff,%edx # force reserved bits to 0
cmp \$0,%r9d
jne .Lnotintel
- or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR
+ or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
and \$15,%ah
cmp \$15,%ah # examine Family ID
- je .Lnotintel
- or \$0x40000000,%edx # use reserved bit to skip unrolled loop
+ jne .Lnotintel
+ or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR
.Lnotintel:
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d
- je .Ldone
+ je .Lgeneric
or \$0x10000000,%edx # 1<<28
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
+.Lgeneric:
+ and \$0x00000800,%r9d # isolate AMD XOP flag
+ and \$0xfffff7ff,%ecx
+ or %ecx,%r9d # merge AMD XOP flag
+
+ mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
+ bt \$27,%r9d # check OSXSAVE bit
+ jnc .Lclear_avx
+ xor %ecx,%ecx # XCR0
+ .byte 0x0f,0x01,0xd0 # xgetbv
+ and \$6,%eax # isolate XMM and YMM state support
+ cmp \$6,%eax
+ je .Ldone
+.Lclear_avx:
+ mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
+ and %eax,%r9d # clear AVX, FMA and AMD XOP bits
.Ldone:
- shl \$32,%rcx
- mov %edx,%eax
- mov %r8,%rbx
- or %rcx,%rax
+ shl \$32,%r9
+ mov %r10d,%eax
+ mov %r8,%rbx # restore %rbx
+ or %r9,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
@@ -229,4 +264,21 @@ OPENSSL_wipe_cpu:
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
+print<<___;
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,\@abi-omnipotent
+.align 16
+OPENSSL_ia32_rdrand:
+ mov \$8,%ecx
+.Loop_rdrand:
+ rdrand %rax
+ jc .Lbreak_rdrand
+ loop .Loop_rdrand
+.Lbreak_rdrand:
+ cmp \$0,%rax
+ cmove %rcx,%rax
+ ret
+.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+___
+
close STDOUT; # flush
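
For reference, the OPENSSL_ia32_rdrand routine added above amounts to the following retry loop. This C sketch is illustrative only (the function name is hypothetical, and an RDRAND-capable toolchain with -mrdrnd is assumed):

    #include <stdint.h>
    #include <immintrin.h>

    static uint64_t ia32_rdrand(void)
    {
        unsigned long long v = 0;
        for (unsigned i = 8; i != 0; i--) {     /* mov $8,%ecx / loop */
            if (_rdrand64_step(&v))             /* CF set: valid value */
                return v != 0 ? v : i;          /* cmove: never 0 on success */
        }
        return 0;                               /* exhausted: signal failure */
    }
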
diff --git a/main/openssl/crypto/x86cpuid.S b/main/openssl/crypto/x86cpuid.S
new file mode 100644
index 00000000..73b5d98e
--- /dev/null
+++ b/main/openssl/crypto/x86cpuid.S
@@ -0,0 +1,334 @@
+.file "x86cpuid.s"
+.text
+.globl OPENSSL_ia32_cpuid
+.type OPENSSL_ia32_cpuid,@function
+.align 16
+OPENSSL_ia32_cpuid:
+.L_OPENSSL_ia32_cpuid_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %edx,%edx
+ pushfl
+ popl %eax
+ movl %eax,%ecx
+ xorl $2097152,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ xorl %eax,%ecx
+ xorl %eax,%eax
+ btl $21,%ecx
+ jnc .L000nocpuid
+	.byte	0x0f,0xa2		# cpuid
+	movl	%eax,%edi		# max standard query level
+	xorl	%eax,%eax
+	cmpl	$1970169159,%ebx	# "Genu"
+	setne	%al
+	movl	%eax,%ebp
+	cmpl	$1231384169,%edx	# "ineI"
+	setne	%al
+	orl	%eax,%ebp
+	cmpl	$1818588270,%ecx	# "ntel"
+	setne	%al
+	orl	%eax,%ebp		# %ebp==0 means GenuineIntel
+	jz	.L001intel
+	cmpl	$1752462657,%ebx	# "Auth"
+	setne	%al
+	movl	%eax,%esi
+	cmpl	$1769238117,%edx	# "enti"
+	setne	%al
+	orl	%eax,%esi
+	cmpl	$1145913699,%ecx	# "cAMD"
+	setne	%al
+	orl	%eax,%esi		# %esi==0 means AuthenticAMD
+	jnz	.L001intel
+	movl	$2147483648,%eax	# 0x80000000, max extended level
+	.byte	0x0f,0xa2		# cpuid
+	cmpl	$2147483649,%eax	# 0x80000001 supported?
+	jb	.L001intel
+	movl	%eax,%esi
+	movl	$2147483649,%eax	# 0x80000001
+	.byte	0x0f,0xa2		# cpuid
+	orl	%ecx,%ebp
+	andl	$2049,%ebp		# isolate AMD XOP bit, 1<<11|1
+	cmpl	$2147483656,%esi	# 0x80000008 supported?
+	jb	.L001intel
+	movl	$2147483656,%eax	# 0x80000008
+	.byte	0x0f,0xa2		# cpuid
+	movzbl	%cl,%esi
+	incl	%esi			# number of cores
+	movl	$1,%eax
+	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	btl	$28,%edx		# test hyper-threading bit
+	jnc	.L002generic
+	shrl	$16,%ebx
+	andl	$255,%ebx		# number of logical processors
+	cmpl	%esi,%ebx
+	ja	.L002generic
+	andl	$4026531839,%edx	# ~(1<<28)
+	jmp	.L002generic
+.L001intel:
+	cmpl	$4,%edi
+	movl	$-1,%edi
+	jb	.L003nocacheinfo
+	movl	$4,%eax			# query cache parameters
+	movl	$0,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	movl	%eax,%edi
+	shrl	$14,%edi
+	andl	$4095,%edi		# number of cores -1 per L1D
+.L003nocacheinfo:
+	movl	$1,%eax
+	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	andl	$3220176895,%edx	# force reserved bits #20, #30 to 0
+	cmpl	$0,%ebp
+	jne	.L004notintel
+	orl	$1073741824,%edx	# set reserved bit#30 on Intel CPUs
+	andb	$15,%ah
+	cmpb	$15,%ah			# examine Family ID
+	jne	.L004notintel
+	orl	$1048576,%edx		# set reserved bit#20 to engage RC4_CHAR
+.L004notintel:
+ btl $28,%edx
+ jnc .L002generic
+ andl $4026531839,%edx
+ cmpl $0,%edi
+ je .L002generic
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja .L002generic
+ andl $4026531839,%edx
+.L002generic:
+	andl	$2048,%ebp		# isolate AMD XOP flag
+	andl	$4294965247,%ecx	# force 11th bit to 0
+	movl	%edx,%esi
+	orl	%ecx,%ebp		# merge AMD XOP flag
+	btl	$27,%ecx		# check OSXSAVE bit
+	jnc	.L005clear_avx
+	xorl	%ecx,%ecx		# XCR0
+.byte	15,1,208			# xgetbv
+	andl	$6,%eax			# isolate XMM and YMM state support
+	cmpl	$6,%eax
+	je	.L006done
+	cmpl	$2,%eax			# XMM state only?
+	je	.L005clear_avx
+.L007clear_xmm:
+	andl	$4261412861,%ebp	# clear AESNI and PCLMULQDQ bits
+	andl	$4278190079,%esi	# clear FXSR
+.L005clear_avx:
+	andl	$4026525695,%ebp	# clear AVX, FMA and AMD XOP bits
+.L006done:
+	movl	%esi,%eax
+	movl	%ebp,%edx
+.L000nocpuid:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size OPENSSL_ia32_cpuid,.-.L_OPENSSL_ia32_cpuid_begin
+.globl OPENSSL_rdtsc
+.type OPENSSL_rdtsc,@function
+.align 16
+OPENSSL_rdtsc:
+.L_OPENSSL_rdtsc_begin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ call .L008PIC_me_up
+.L008PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L008PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+	btl	$4,(%ecx)		# test TSC capability bit
+	jnc	.L009notsc
+	.byte	0x0f,0x31		# rdtsc
+.L009notsc:
+ ret
+.size OPENSSL_rdtsc,.-.L_OPENSSL_rdtsc_begin
+.globl OPENSSL_instrument_halt
+.type OPENSSL_instrument_halt,@function
+.align 16
+OPENSSL_instrument_halt:
+.L_OPENSSL_instrument_halt_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+ btl $4,(%ecx)
+ jnc .L011nohalt
+.long	2421723150			# push %cs; pop %eax
+	andl	$3,%eax			# check privilege level
+	jnz	.L011nohalt		# not enough privileges
+	pushfl
+	popl	%eax
+	btl	$9,%eax			# interrupt flag
+	jnc	.L011nohalt		# interrupts are disabled
+	.byte	0x0f,0x31		# rdtsc
+	pushl	%edx
+	pushl	%eax
+	hlt
+	.byte	0x0f,0x31		# rdtsc
+	subl	(%esp),%eax		# cycles spent in halted state
+	sbbl	4(%esp),%edx
+ addl $8,%esp
+ ret
+.L011nohalt:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ ret
+.size OPENSSL_instrument_halt,.-.L_OPENSSL_instrument_halt_begin
+.globl OPENSSL_far_spin
+.type OPENSSL_far_spin,@function
+.align 16
+OPENSSL_far_spin:
+.L_OPENSSL_far_spin_begin:
+ pushfl
+ popl %eax
+ btl $9,%eax
+ jnc .L012nospin
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+.long 2430111262
+ xorl %eax,%eax
+ movl (%ecx),%edx
+ jmp .L013spin
+.align 16
+.L013spin:
+ incl %eax
+ cmpl (%ecx),%edx
+ je .L013spin
+.long 529567888
+ ret
+.L012nospin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ ret
+.size OPENSSL_far_spin,.-.L_OPENSSL_far_spin_begin
+.globl OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,@function
+.align 16
+OPENSSL_wipe_cpu:
+.L_OPENSSL_wipe_cpu_begin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ call .L014PIC_me_up
+.L014PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L014PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+ movl (%ecx),%ecx
+ btl $1,(%ecx)
+ jnc .L015no_x87
+.long	4007259865,4007259865,4007259865,4007259865,2430851995	# fldz x8; fwait; fninit; nop
+.L015no_x87:
+ leal 4(%esp),%eax
+ ret
+.size OPENSSL_wipe_cpu,.-.L_OPENSSL_wipe_cpu_begin
+.globl OPENSSL_atomic_add
+.type OPENSSL_atomic_add,@function
+.align 16
+OPENSSL_atomic_add:
+.L_OPENSSL_atomic_add_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
+ pushl %ebx
+ nop
+	movl	(%edx),%eax
+.L016spin:
+	leal	(%eax,%ecx,1),%ebx
+	nop
+.long	447811568			# lock cmpxchgl %ebx,(%edx)
+	jne	.L016spin
+	movl	%ebx,%eax
+ popl %ebx
+ ret
+.size OPENSSL_atomic_add,.-.L_OPENSSL_atomic_add_begin
+.globl OPENSSL_indirect_call
+.type OPENSSL_indirect_call,@function
+.align 16
+OPENSSL_indirect_call:
+.L_OPENSSL_indirect_call_begin:
+ pushl %ebp
+ movl %esp,%ebp
+ subl $28,%esp
+ movl 12(%ebp),%ecx
+ movl %ecx,(%esp)
+ movl 16(%ebp),%edx
+ movl %edx,4(%esp)
+ movl 20(%ebp),%eax
+ movl %eax,8(%esp)
+ movl 24(%ebp),%eax
+ movl %eax,12(%esp)
+ movl 28(%ebp),%eax
+ movl %eax,16(%esp)
+ movl 32(%ebp),%eax
+ movl %eax,20(%esp)
+ movl 36(%ebp),%eax
+ movl %eax,24(%esp)
+ call *8(%ebp)
+ movl %ebp,%esp
+ popl %ebp
+ ret
+.size OPENSSL_indirect_call,.-.L_OPENSSL_indirect_call_begin
+.globl OPENSSL_cleanse
+.type OPENSSL_cleanse,@function
+.align 16
+OPENSSL_cleanse:
+.L_OPENSSL_cleanse_begin:
+	movl	4(%esp),%edx		# buffer
+	movl	8(%esp),%ecx		# length
+	xorl	%eax,%eax		# zero filler
+	cmpl	$7,%ecx			# 7 bytes or more: align first
+ jae .L017lot
+ cmpl $0,%ecx
+ je .L018ret
+.L019little:
+ movb %al,(%edx)
+ subl $1,%ecx
+ leal 1(%edx),%edx
+ jnz .L019little
+.L018ret:
+ ret
+.align 16
+.L017lot:
+ testl $3,%edx
+ jz .L020aligned
+ movb %al,(%edx)
+ leal -1(%ecx),%ecx
+ leal 1(%edx),%edx
+ jmp .L017lot
+.L020aligned:
+	movl	%eax,(%edx)		# clear 4 bytes at a time
+ leal -4(%ecx),%ecx
+ testl $-4,%ecx
+ leal 4(%edx),%edx
+ jnz .L020aligned
+ cmpl $0,%ecx
+ jne .L019little
+ ret
+.size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,@function
+.align 16
+OPENSSL_ia32_rdrand:
+.L_OPENSSL_ia32_rdrand_begin:
+	movl	$8,%ecx			# up to 8 attempts
+.L021loop:
+.byte	15,199,240			# rdrand %eax
+	jc	.L022break		# CF set means a valid value
+	loop	.L021loop
+.L022break:
+	cmpl	$0,%eax
+	cmovel	%ecx,%eax		# never return 0 on success
+ ret
+.size OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin
+.comm OPENSSL_ia32cap_P,8,4
+.section .init
+ call OPENSSL_cpuid_setup
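
The OPENSSL_cleanse code above (and its x86_64 twin earlier in this patch) follows a byte/word/byte pattern; being assembly, the stores cannot be elided by a compiler. A simplified C restatement, illustrative only (the real code skips the alignment phase for short buffers, and the function name here is hypothetical):

    #include <stddef.h>

    static void cleanse_sketch(void *ptr, size_t len)
    {
        volatile unsigned char *p = ptr;
        /* clear single bytes until the pointer is word-aligned */
        while (len > 0 && ((size_t)p & (sizeof(size_t) - 1))) {
            *p++ = 0; len--;
        }
        /* clear a full word per iteration (4 bytes on x86, 8 on x86_64) */
        while (len >= sizeof(size_t)) {
            *(volatile size_t *)p = 0;
            p += sizeof(size_t); len -= sizeof(size_t);
        }
        /* clear the remaining tail bytes */
        while (len--)
            *p++ = 0;
    }
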
diff --git a/main/openssl/crypto/x86cpuid.pl b/main/openssl/crypto/x86cpuid.pl
index a7464af1..b270b443 100644
--- a/main/openssl/crypto/x86cpuid.pl
+++ b/main/openssl/crypto/x86cpuid.pl
@@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&pushf ();
&pop ("eax");
&xor ("ecx","eax");
- &bt ("ecx",21);
- &jnc (&label("done"));
&xor ("eax","eax");
+ &bt ("ecx",21);
+ &jnc (&label("nocpuid"));
&cpuid ();
&mov ("edi","eax"); # max value for standard query level
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# AMD specific
&mov ("eax",0x80000000);
&cpuid ();
- &cmp ("eax",0x80000008);
+ &cmp ("eax",0x80000001);
+ &jb (&label("intel"));
+ &mov ("esi","eax");
+ &mov ("eax",0x80000001);
+ &cpuid ();
+ &or ("ebp","ecx");
+ &and ("ebp",1<<11|1); # isolate XOP bit
+ &cmp ("esi",0x80000008);
&jb (&label("intel"));
&mov ("eax",0x80000008);
@@ -60,15 +67,16 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&inc ("esi"); # number of cores
&mov ("eax",1);
+ &xor ("ecx","ecx");
&cpuid ();
&bt ("edx",28);
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&shr ("ebx",16);
&and ("ebx",0xff);
&cmp ("ebx","esi");
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
- &jmp (&label("done"));
+ &jmp (&label("generic"));
&set_label("intel");
&cmp ("edi",4);
@@ -84,28 +92,53 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&set_label("nocacheinfo");
&mov ("eax",1);
+ &xor ("ecx","ecx");
&cpuid ();
+ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
&cmp ("ebp",0);
- &jne (&label("notP4"));
+ &jne (&label("notintel"));
+ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
 	&and	(&HB("eax"),15);	# family ID
&cmp (&HB("eax"),15); # P4?
- &jne (&label("notP4"));
- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
-&set_label("notP4");
+ &jne (&label("notintel"));
+ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
+&set_label("notintel");
&bt ("edx",28); # test hyper-threading bit
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&and ("edx",0xefffffff);
&cmp ("edi",0);
- &je (&label("done"));
+ &je (&label("generic"));
&or ("edx",0x10000000);
&shr ("ebx",16);
&cmp (&LB("ebx"),1);
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if not
+
+&set_label("generic");
+ &and ("ebp",1<<11); # isolate AMD XOP flag
+ &and ("ecx",0xfffff7ff); # force 11th bit to 0
+ &mov ("esi","edx");
+ &or ("ebp","ecx"); # merge AMD XOP flag
+
+ &bt ("ecx",27); # check OSXSAVE bit
+ &jnc (&label("clear_avx"));
+ &xor ("ecx","ecx");
+ &data_byte(0x0f,0x01,0xd0); # xgetbv
+ &and ("eax",6);
+ &cmp ("eax",6);
+ &je (&label("done"));
+ &cmp ("eax",2);
+ &je (&label("clear_avx"));
+&set_label("clear_xmm");
+ &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
+ &and ("esi",0xfeffffff); # clear FXSR
+&set_label("clear_avx");
+ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
&set_label("done");
- &mov ("eax","edx");
- &mov ("edx","ecx");
+ &mov ("eax","esi");
+ &mov ("edx","ebp");
+&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");
&external_label("OPENSSL_ia32cap_P");
@@ -134,7 +167,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&jnz (&label("nohalt")); # not enough privileges
&pushf ();
- &pop ("eax")
+ &pop ("eax");
&bt ("eax",9);
&jnc (&label("nohalt")); # interrupts are disabled
@@ -199,8 +232,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87"));
if ($sse2) {
- &bt (&DWP(0,"ecx"),26);
- &jnc (&label("no_sse2"));
+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
+ &cmp ("ecx",1<<26|1<<24);
+ &jne (&label("no_sse2"));
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
@@ -248,7 +282,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
{
- my $i,$max=7; # $max has to be chosen as 4*n-1
+ my ($max,$i)=(7,); # $max has to be chosen as 4*n-1
# in order to preserve eventual
# stack alignment
&push ("ebp");
@@ -307,6 +341,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&ret ();
&function_end_B("OPENSSL_cleanse");
+&function_begin_B("OPENSSL_ia32_rdrand");
+ &mov ("ecx",8);
+&set_label("loop");
+ &rdrand ("eax");
+ &jc (&label("break"));
+ &loop (&label("loop"));
+&set_label("break");
+ &cmp ("eax",0);
+ &cmove ("eax","ecx");
+ &ret ();
+&function_end_B("OPENSSL_ia32_rdrand");
+
&initseg("OPENSSL_cpuid_setup");
&asm_finish();
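
The OSXSAVE/XGETBV gate this patch adds to both cpuid scripts reads as: even when CPUID advertises AVX, the bits are kept only if the OS has enabled XMM and YMM state saving in XCR0. A hedged C equivalent using the same opcode bytes as the patch (illustrative only; GCC-style inline asm and a hypothetical function name assumed):

    #include <stdint.h>

    static int os_supports_avx(uint32_t cpuid_ecx)
    {
        uint32_t eax, edx;
        if (!(cpuid_ecx & (1u << 27)))   /* OSXSAVE not set: XGETBV would fault */
            return 0;
        __asm__ volatile (".byte 0x0f,0x01,0xd0"   /* xgetbv, reads XCR0 */
                          : "=a"(eax), "=d"(edx) : "c"(0));
        (void)edx;
        return (eax & 6) == 6;           /* XMM (bit 1) and YMM (bit 2) saved */
    }
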